Blame - aom_dsp/fwd_txfm.c - avm

blob: 12745abb3df2c455f81fe121bc48cfa214ab9ad5 [file] [log] [blame]

Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	1	/*
Adrian Grange	a872b06	2016-03-24 11:38:32 -0700	[diff] [blame]	2	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	3	*
Adrian Grange	a872b06	2016-03-24 11:38:32 -0700	[diff] [blame]	4	* This source code is subject to the terms of the BSD 2 Clause License and
				5	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
				6	* was not distributed with this source code in the LICENSE file, you can
				7	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
				8	* Media Patent License 1.0 was not distributed with this source code in the
				9	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	10	*/
				11
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	12	#include <assert.h>
Yaowu Xu	bf4202e	2016-03-21 15:15:19 -0700	[diff] [blame]	13	#include "aom_dsp/fwd_txfm.h"
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	14
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	15	void aom_fdct4x4_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	16	// The 2D transform is done with two passes which are actually pretty
				17	// similar. In the first one, we transform the columns and transpose
				18	// the results. In the second one, we transform the rows. To achieve that,
				19	// as the first pass results are transposed, we transpose the columns (that
				20	// is the transposed rows) and transpose the results (so that it goes back
				21	// in normal/row positions).
				22	int pass;
				23	// We need an intermediate buffer between passes.
				24	tran_low_t intermediate[4 * 4];
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	25	const tran_low_t *in_low = NULL;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	26	tran_low_t *out = intermediate;
				27	// Do the two transform/transpose passes
				28	for (pass = 0; pass < 2; ++pass) {
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	29	tran_high_t in_high[4]; // canbe16
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	30	tran_high_t step[4]; // canbe16
				31	tran_high_t temp1, temp2; // needs32
				32	int i;
				33	for (i = 0; i < 4; ++i) {
				34	// Load inputs.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	35	if (pass == 0) {
				36	in_high[0] = input[0 * stride] * 16;
				37	in_high[1] = input[1 * stride] * 16;
				38	in_high[2] = input[2 * stride] * 16;
				39	in_high[3] = input[3 * stride] * 16;
				40	if (i == 0 && in_high[0]) {
				41	++in_high[0];
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	42	}
				43	} else {
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	44	assert(in_low != NULL);
				45	in_high[0] = in_low[0 * 4];
				46	in_high[1] = in_low[1 * 4];
				47	in_high[2] = in_low[2 * 4];
				48	in_high[3] = in_low[3 * 4];
				49	++in_low;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	50	}
				51	// Transform.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	52	step[0] = in_high[0] + in_high[3];
				53	step[1] = in_high[1] + in_high[2];
				54	step[2] = in_high[1] - in_high[2];
				55	step[3] = in_high[0] - in_high[3];
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	56	temp1 = (step[0] + step[1]) * cospi_16_64;
				57	temp2 = (step[0] - step[1]) * cospi_16_64;
				58	out[0] = (tran_low_t)fdct_round_shift(temp1);
				59	out[2] = (tran_low_t)fdct_round_shift(temp2);
				60	temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
				61	temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
				62	out[1] = (tran_low_t)fdct_round_shift(temp1);
				63	out[3] = (tran_low_t)fdct_round_shift(temp2);
				64	// Do next column (which is a transposed row in second/horizontal pass)
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	65	++input;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	66	out += 4;
				67	}
				68	// Setup in/out for next pass.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	69	in_low = intermediate;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	70	out = output;
				71	}
				72
				73	{
				74	int i, j;
				75	for (i = 0; i < 4; ++i) {
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	76	for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	77	}
				78	}
				79	}
				80
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	81	void aom_fdct4x4_1_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	82	int r, c;
				83	tran_low_t sum = 0;
				84	for (r = 0; r < 4; ++r)
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	85	for (c = 0; c < 4; ++c) sum += input[r * stride + c];
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	86
				87	output[0] = sum << 1;
				88	output[1] = 0;
				89	}
				90
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	91	void aom_fdct8x8_c(const int16_t input, tran_low_t final_output, int stride) {
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	92	int i, j;
				93	tran_low_t intermediate[64];
				94	int pass;
				95	tran_low_t *output = intermediate;
				96	const tran_low_t *in = NULL;
				97
				98	// Transform columns
				99	for (pass = 0; pass < 2; ++pass) {
				100	tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
				101	tran_high_t t0, t1, t2, t3; // needs32
				102	tran_high_t x0, x1, x2, x3; // canbe16
				103
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	104	for (i = 0; i < 8; i++) {
				105	// stage 1
				106	if (pass == 0) {
				107	s0 = (input[0 * stride] + input[7 * stride]) * 4;
				108	s1 = (input[1 * stride] + input[6 * stride]) * 4;
				109	s2 = (input[2 * stride] + input[5 * stride]) * 4;
				110	s3 = (input[3 * stride] + input[4 * stride]) * 4;
				111	s4 = (input[3 * stride] - input[4 * stride]) * 4;
				112	s5 = (input[2 * stride] - input[5 * stride]) * 4;
				113	s6 = (input[1 * stride] - input[6 * stride]) * 4;
				114	s7 = (input[0 * stride] - input[7 * stride]) * 4;
				115	++input;
				116	} else {
				117	s0 = in[0 * 8] + in[7 * 8];
				118	s1 = in[1 * 8] + in[6 * 8];
				119	s2 = in[2 * 8] + in[5 * 8];
				120	s3 = in[3 * 8] + in[4 * 8];
				121	s4 = in[3 * 8] - in[4 * 8];
				122	s5 = in[2 * 8] - in[5 * 8];
				123	s6 = in[1 * 8] - in[6 * 8];
				124	s7 = in[0 * 8] - in[7 * 8];
				125	++in;
				126	}
				127
				128	// fdct4(step, step);
				129	x0 = s0 + s3;
				130	x1 = s1 + s2;
				131	x2 = s1 - s2;
				132	x3 = s0 - s3;
				133	t0 = (x0 + x1) * cospi_16_64;
				134	t1 = (x0 - x1) * cospi_16_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	135	t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
				136	t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	137	output[0] = (tran_low_t)fdct_round_shift(t0);
				138	output[2] = (tran_low_t)fdct_round_shift(t2);
				139	output[4] = (tran_low_t)fdct_round_shift(t1);
				140	output[6] = (tran_low_t)fdct_round_shift(t3);
				141
				142	// Stage 2
				143	t0 = (s6 - s5) * cospi_16_64;
				144	t1 = (s6 + s5) * cospi_16_64;
				145	t2 = fdct_round_shift(t0);
				146	t3 = fdct_round_shift(t1);
				147
				148	// Stage 3
				149	x0 = s4 + t2;
				150	x1 = s4 - t2;
				151	x2 = s7 - t3;
				152	x3 = s7 + t3;
				153
				154	// Stage 4
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	155	t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
				156	t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	157	t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	158	t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	159	output[1] = (tran_low_t)fdct_round_shift(t0);
				160	output[3] = (tran_low_t)fdct_round_shift(t2);
				161	output[5] = (tran_low_t)fdct_round_shift(t1);
				162	output[7] = (tran_low_t)fdct_round_shift(t3);
				163	output += 8;
				164	}
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	165	in = intermediate;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	166	output = final_output;
				167	}
				168
				169	// Rows
				170	for (i = 0; i < 8; ++i) {
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	171	for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	172	}
				173	}
				174
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	175	void aom_fdct8x8_1_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	176	int r, c;
				177	tran_low_t sum = 0;
				178	for (r = 0; r < 8; ++r)
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	179	for (c = 0; c < 8; ++c) sum += input[r * stride + c];
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	180
				181	output[0] = sum;
				182	output[1] = 0;
				183	}
				184
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	185	void aom_fdct16x16_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	186	// The 2D transform is done with two passes which are actually pretty
				187	// similar. In the first one, we transform the columns and transpose
				188	// the results. In the second one, we transform the rows. To achieve that,
				189	// as the first pass results are transposed, we transpose the columns (that
				190	// is the transposed rows) and transpose the results (so that it goes back
				191	// in normal/row positions).
				192	int pass;
				193	// We need an intermediate buffer between passes.
				194	tran_low_t intermediate[256];
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	195	const tran_low_t *in_low = NULL;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	196	tran_low_t *out = intermediate;
				197	// Do the two transform/transpose passes
				198	for (pass = 0; pass < 2; ++pass) {
				199	tran_high_t step1[8]; // canbe16
				200	tran_high_t step2[8]; // canbe16
				201	tran_high_t step3[8]; // canbe16
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	202	tran_high_t in_high[8]; // canbe16
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	203	tran_high_t temp1, temp2; // needs32
				204	int i;
				205	for (i = 0; i < 16; i++) {
				206	if (0 == pass) {
				207	// Calculate input for the first 8 results.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	208	in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
				209	in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
				210	in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
				211	in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
				212	in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
				213	in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
				214	in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
				215	in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	216	// Calculate input for the next 8 results.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	217	step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
				218	step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
				219	step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
				220	step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
				221	step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
				222	step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
				223	step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
				224	step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	225	} else {
				226	// Calculate input for the first 8 results.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	227	assert(in_low != NULL);
				228	in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
				229	in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
				230	in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
				231	in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
				232	in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
				233	in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
				234	in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
				235	in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	236	// Calculate input for the next 8 results.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	237	step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
				238	step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
				239	step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
				240	step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
				241	step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
				242	step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
				243	step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
				244	step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
				245	in_low++;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	246	}
				247	// Work on the first eight values; fdct8(input, even_results);
				248	{
				249	tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
				250	tran_high_t t0, t1, t2, t3; // needs32
				251	tran_high_t x0, x1, x2, x3; // canbe16
				252
				253	// stage 1
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	254	s0 = in_high[0] + in_high[7];
				255	s1 = in_high[1] + in_high[6];
				256	s2 = in_high[2] + in_high[5];
				257	s3 = in_high[3] + in_high[4];
				258	s4 = in_high[3] - in_high[4];
				259	s5 = in_high[2] - in_high[5];
				260	s6 = in_high[1] - in_high[6];
				261	s7 = in_high[0] - in_high[7];
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	262
				263	// fdct4(step, step);
				264	x0 = s0 + s3;
				265	x1 = s1 + s2;
				266	x2 = s1 - s2;
				267	x3 = s0 - s3;
				268	t0 = (x0 + x1) * cospi_16_64;
				269	t1 = (x0 - x1) * cospi_16_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	270	t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	271	t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
				272	out[0] = (tran_low_t)fdct_round_shift(t0);
				273	out[4] = (tran_low_t)fdct_round_shift(t2);
				274	out[8] = (tran_low_t)fdct_round_shift(t1);
				275	out[12] = (tran_low_t)fdct_round_shift(t3);
				276
				277	// Stage 2
				278	t0 = (s6 - s5) * cospi_16_64;
				279	t1 = (s6 + s5) * cospi_16_64;
				280	t2 = fdct_round_shift(t0);
				281	t3 = fdct_round_shift(t1);
				282
				283	// Stage 3
				284	x0 = s4 + t2;
				285	x1 = s4 - t2;
				286	x2 = s7 - t3;
				287	x3 = s7 + t3;
				288
				289	// Stage 4
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	290	t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
				291	t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	292	t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	293	t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	294	out[2] = (tran_low_t)fdct_round_shift(t0);
				295	out[6] = (tran_low_t)fdct_round_shift(t2);
				296	out[10] = (tran_low_t)fdct_round_shift(t1);
				297	out[14] = (tran_low_t)fdct_round_shift(t3);
				298	}
				299	// Work on the next eight values; step1 -> odd_results
				300	{
				301	// step 2
				302	temp1 = (step1[5] - step1[2]) * cospi_16_64;
				303	temp2 = (step1[4] - step1[3]) * cospi_16_64;
				304	step2[2] = fdct_round_shift(temp1);
				305	step2[3] = fdct_round_shift(temp2);
				306	temp1 = (step1[4] + step1[3]) * cospi_16_64;
				307	temp2 = (step1[5] + step1[2]) * cospi_16_64;
				308	step2[4] = fdct_round_shift(temp1);
				309	step2[5] = fdct_round_shift(temp2);
				310	// step 3
				311	step3[0] = step1[0] + step2[3];
				312	step3[1] = step1[1] + step2[2];
				313	step3[2] = step1[1] - step2[2];
				314	step3[3] = step1[0] - step2[3];
				315	step3[4] = step1[7] - step2[4];
				316	step3[5] = step1[6] - step2[5];
				317	step3[6] = step1[6] + step2[5];
				318	step3[7] = step1[7] + step2[4];
				319	// step 4
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	320	temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
				321	temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	322	step2[1] = fdct_round_shift(temp1);
				323	step2[2] = fdct_round_shift(temp2);
				324	temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	325	temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	326	step2[5] = fdct_round_shift(temp1);
				327	step2[6] = fdct_round_shift(temp2);
				328	// step 5
				329	step1[0] = step3[0] + step2[1];
				330	step1[1] = step3[0] - step2[1];
				331	step1[2] = step3[3] + step2[2];
				332	step1[3] = step3[3] - step2[2];
				333	step1[4] = step3[4] - step2[5];
				334	step1[5] = step3[4] + step2[5];
				335	step1[6] = step3[7] - step2[6];
				336	step1[7] = step3[7] + step2[6];
				337	// step 6
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	338	temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	339	temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
				340	out[1] = (tran_low_t)fdct_round_shift(temp1);
				341	out[9] = (tran_low_t)fdct_round_shift(temp2);
				342	temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	343	temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	344	out[5] = (tran_low_t)fdct_round_shift(temp1);
				345	out[13] = (tran_low_t)fdct_round_shift(temp2);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	346	temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	347	temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
				348	out[3] = (tran_low_t)fdct_round_shift(temp1);
				349	out[11] = (tran_low_t)fdct_round_shift(temp2);
				350	temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	351	temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	352	out[7] = (tran_low_t)fdct_round_shift(temp1);
				353	out[15] = (tran_low_t)fdct_round_shift(temp2);
				354	}
				355	// Do next column (which is a transposed row in second/horizontal pass)
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	356	input++;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	357	out += 16;
				358	}
				359	// Setup in/out for next pass.
Urvang Joshi	09eea21	2016-07-14 11:40:38 -0700	[diff] [blame]	360	in_low = intermediate;
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	361	out = output;
				362	}
				363	}
				364
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	365	void aom_fdct16x16_1_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	366	int r, c;
				367	tran_low_t sum = 0;
				368	for (r = 0; r < 16; ++r)
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	369	for (c = 0; c < 16; ++c) sum += input[r * stride + c];
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	370
				371	output[0] = sum >> 1;
				372	output[1] = 0;
				373	}
				374
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	375	static INLINE tran_high_t dct_32_round(tran_high_t input) {
				376	tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
				377	// TODO(debargha, peter.derivaz): Find new bounds for this assert,
				378	// and make the bounds consts.
				379	// assert(-131072 <= rv && rv <= 131071);
				380	return rv;
				381	}
				382
				383	static INLINE tran_high_t half_round_shift(tran_high_t input) {
				384	tran_high_t rv = (input + 1 + (input < 0)) >> 2;
				385	return rv;
				386	}
				387
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	388	void aom_fdct32(const tran_high_t input, tran_high_t output, int round) {
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	389	tran_high_t step[32];
				390	// Stage 1
				391	step[0] = input[0] + input[(32 - 1)];
				392	step[1] = input[1] + input[(32 - 2)];
				393	step[2] = input[2] + input[(32 - 3)];
				394	step[3] = input[3] + input[(32 - 4)];
				395	step[4] = input[4] + input[(32 - 5)];
				396	step[5] = input[5] + input[(32 - 6)];
				397	step[6] = input[6] + input[(32 - 7)];
				398	step[7] = input[7] + input[(32 - 8)];
				399	step[8] = input[8] + input[(32 - 9)];
				400	step[9] = input[9] + input[(32 - 10)];
				401	step[10] = input[10] + input[(32 - 11)];
				402	step[11] = input[11] + input[(32 - 12)];
				403	step[12] = input[12] + input[(32 - 13)];
				404	step[13] = input[13] + input[(32 - 14)];
				405	step[14] = input[14] + input[(32 - 15)];
				406	step[15] = input[15] + input[(32 - 16)];
				407	step[16] = -input[16] + input[(32 - 17)];
				408	step[17] = -input[17] + input[(32 - 18)];
				409	step[18] = -input[18] + input[(32 - 19)];
				410	step[19] = -input[19] + input[(32 - 20)];
				411	step[20] = -input[20] + input[(32 - 21)];
				412	step[21] = -input[21] + input[(32 - 22)];
				413	step[22] = -input[22] + input[(32 - 23)];
				414	step[23] = -input[23] + input[(32 - 24)];
				415	step[24] = -input[24] + input[(32 - 25)];
				416	step[25] = -input[25] + input[(32 - 26)];
				417	step[26] = -input[26] + input[(32 - 27)];
				418	step[27] = -input[27] + input[(32 - 28)];
				419	step[28] = -input[28] + input[(32 - 29)];
				420	step[29] = -input[29] + input[(32 - 30)];
				421	step[30] = -input[30] + input[(32 - 31)];
				422	step[31] = -input[31] + input[(32 - 32)];
				423
				424	// Stage 2
				425	output[0] = step[0] + step[16 - 1];
				426	output[1] = step[1] + step[16 - 2];
				427	output[2] = step[2] + step[16 - 3];
				428	output[3] = step[3] + step[16 - 4];
				429	output[4] = step[4] + step[16 - 5];
				430	output[5] = step[5] + step[16 - 6];
				431	output[6] = step[6] + step[16 - 7];
				432	output[7] = step[7] + step[16 - 8];
				433	output[8] = -step[8] + step[16 - 9];
				434	output[9] = -step[9] + step[16 - 10];
				435	output[10] = -step[10] + step[16 - 11];
				436	output[11] = -step[11] + step[16 - 12];
				437	output[12] = -step[12] + step[16 - 13];
				438	output[13] = -step[13] + step[16 - 14];
				439	output[14] = -step[14] + step[16 - 15];
				440	output[15] = -step[15] + step[16 - 16];
				441
				442	output[16] = step[16];
				443	output[17] = step[17];
				444	output[18] = step[18];
				445	output[19] = step[19];
				446
				447	output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
				448	output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
				449	output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
				450	output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
				451
				452	output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
				453	output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
				454	output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
				455	output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
				456
				457	output[28] = step[28];
				458	output[29] = step[29];
				459	output[30] = step[30];
				460	output[31] = step[31];
				461
				462	// dump the magnitude by 4, hence the intermediate values are within
				463	// the range of 16 bits.
				464	if (round) {
				465	output[0] = half_round_shift(output[0]);
				466	output[1] = half_round_shift(output[1]);
				467	output[2] = half_round_shift(output[2]);
				468	output[3] = half_round_shift(output[3]);
				469	output[4] = half_round_shift(output[4]);
				470	output[5] = half_round_shift(output[5]);
				471	output[6] = half_round_shift(output[6]);
				472	output[7] = half_round_shift(output[7]);
				473	output[8] = half_round_shift(output[8]);
				474	output[9] = half_round_shift(output[9]);
				475	output[10] = half_round_shift(output[10]);
				476	output[11] = half_round_shift(output[11]);
				477	output[12] = half_round_shift(output[12]);
				478	output[13] = half_round_shift(output[13]);
				479	output[14] = half_round_shift(output[14]);
				480	output[15] = half_round_shift(output[15]);
				481
				482	output[16] = half_round_shift(output[16]);
				483	output[17] = half_round_shift(output[17]);
				484	output[18] = half_round_shift(output[18]);
				485	output[19] = half_round_shift(output[19]);
				486	output[20] = half_round_shift(output[20]);
				487	output[21] = half_round_shift(output[21]);
				488	output[22] = half_round_shift(output[22]);
				489	output[23] = half_round_shift(output[23]);
				490	output[24] = half_round_shift(output[24]);
				491	output[25] = half_round_shift(output[25]);
				492	output[26] = half_round_shift(output[26]);
				493	output[27] = half_round_shift(output[27]);
				494	output[28] = half_round_shift(output[28]);
				495	output[29] = half_round_shift(output[29]);
				496	output[30] = half_round_shift(output[30]);
				497	output[31] = half_round_shift(output[31]);
				498	}
				499
				500	// Stage 3
				501	step[0] = output[0] + output[(8 - 1)];
				502	step[1] = output[1] + output[(8 - 2)];
				503	step[2] = output[2] + output[(8 - 3)];
				504	step[3] = output[3] + output[(8 - 4)];
				505	step[4] = -output[4] + output[(8 - 5)];
				506	step[5] = -output[5] + output[(8 - 6)];
				507	step[6] = -output[6] + output[(8 - 7)];
				508	step[7] = -output[7] + output[(8 - 8)];
				509	step[8] = output[8];
				510	step[9] = output[9];
				511	step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
				512	step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
				513	step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
				514	step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
				515	step[14] = output[14];
				516	step[15] = output[15];
				517
				518	step[16] = output[16] + output[23];
				519	step[17] = output[17] + output[22];
				520	step[18] = output[18] + output[21];
				521	step[19] = output[19] + output[20];
				522	step[20] = -output[20] + output[19];
				523	step[21] = -output[21] + output[18];
				524	step[22] = -output[22] + output[17];
				525	step[23] = -output[23] + output[16];
				526	step[24] = -output[24] + output[31];
				527	step[25] = -output[25] + output[30];
				528	step[26] = -output[26] + output[29];
				529	step[27] = -output[27] + output[28];
				530	step[28] = output[28] + output[27];
				531	step[29] = output[29] + output[26];
				532	step[30] = output[30] + output[25];
				533	step[31] = output[31] + output[24];
				534
				535	// Stage 4
				536	output[0] = step[0] + step[3];
				537	output[1] = step[1] + step[2];
				538	output[2] = -step[2] + step[1];
				539	output[3] = -step[3] + step[0];
				540	output[4] = step[4];
				541	output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
				542	output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
				543	output[7] = step[7];
				544	output[8] = step[8] + step[11];
				545	output[9] = step[9] + step[10];
				546	output[10] = -step[10] + step[9];
				547	output[11] = -step[11] + step[8];
				548	output[12] = -step[12] + step[15];
				549	output[13] = -step[13] + step[14];
				550	output[14] = step[14] + step[13];
				551	output[15] = step[15] + step[12];
				552
				553	output[16] = step[16];
				554	output[17] = step[17];
				555	output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
				556	output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
				557	output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
				558	output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
				559	output[22] = step[22];
				560	output[23] = step[23];
				561	output[24] = step[24];
				562	output[25] = step[25];
				563	output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
				564	output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
				565	output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
				566	output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
				567	output[30] = step[30];
				568	output[31] = step[31];
				569
				570	// Stage 5
				571	step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
				572	step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
				573	step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
				574	step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
				575	step[4] = output[4] + output[5];
				576	step[5] = -output[5] + output[4];
				577	step[6] = -output[6] + output[7];
				578	step[7] = output[7] + output[6];
				579	step[8] = output[8];
				580	step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
				581	step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
				582	step[11] = output[11];
				583	step[12] = output[12];
				584	step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
				585	step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
				586	step[15] = output[15];
				587
				588	step[16] = output[16] + output[19];
				589	step[17] = output[17] + output[18];
				590	step[18] = -output[18] + output[17];
				591	step[19] = -output[19] + output[16];
				592	step[20] = -output[20] + output[23];
				593	step[21] = -output[21] + output[22];
				594	step[22] = output[22] + output[21];
				595	step[23] = output[23] + output[20];
				596	step[24] = output[24] + output[27];
				597	step[25] = output[25] + output[26];
				598	step[26] = -output[26] + output[25];
				599	step[27] = -output[27] + output[24];
				600	step[28] = -output[28] + output[31];
				601	step[29] = -output[29] + output[30];
				602	step[30] = output[30] + output[29];
				603	step[31] = output[31] + output[28];
				604
				605	// Stage 6
				606	output[0] = step[0];
				607	output[1] = step[1];
				608	output[2] = step[2];
				609	output[3] = step[3];
				610	output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
				611	output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
				612	output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
				613	output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
				614	output[8] = step[8] + step[9];
				615	output[9] = -step[9] + step[8];
				616	output[10] = -step[10] + step[11];
				617	output[11] = step[11] + step[10];
				618	output[12] = step[12] + step[13];
				619	output[13] = -step[13] + step[12];
				620	output[14] = -step[14] + step[15];
				621	output[15] = step[15] + step[14];
				622
				623	output[16] = step[16];
				624	output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
				625	output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
				626	output[19] = step[19];
				627	output[20] = step[20];
				628	output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
				629	output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
				630	output[23] = step[23];
				631	output[24] = step[24];
				632	output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
				633	output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
				634	output[27] = step[27];
				635	output[28] = step[28];
				636	output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
				637	output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
				638	output[31] = step[31];
				639
				640	// Stage 7
				641	step[0] = output[0];
				642	step[1] = output[1];
				643	step[2] = output[2];
				644	step[3] = output[3];
				645	step[4] = output[4];
				646	step[5] = output[5];
				647	step[6] = output[6];
				648	step[7] = output[7];
				649	step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
				650	step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
				651	step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
				652	step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
				653	step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
				654	step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
				655	step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
				656	step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
				657
				658	step[16] = output[16] + output[17];
				659	step[17] = -output[17] + output[16];
				660	step[18] = -output[18] + output[19];
				661	step[19] = output[19] + output[18];
				662	step[20] = output[20] + output[21];
				663	step[21] = -output[21] + output[20];
				664	step[22] = -output[22] + output[23];
				665	step[23] = output[23] + output[22];
				666	step[24] = output[24] + output[25];
				667	step[25] = -output[25] + output[24];
				668	step[26] = -output[26] + output[27];
				669	step[27] = output[27] + output[26];
				670	step[28] = output[28] + output[29];
				671	step[29] = -output[29] + output[28];
				672	step[30] = -output[30] + output[31];
				673	step[31] = output[31] + output[30];
				674
				675	// Final stage --- outputs indices are bit-reversed.
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	676	output[0] = step[0];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	677	output[16] = step[1];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	678	output[8] = step[2];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	679	output[24] = step[3];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	680	output[4] = step[4];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	681	output[20] = step[5];
				682	output[12] = step[6];
				683	output[28] = step[7];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	684	output[2] = step[8];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	685	output[18] = step[9];
				686	output[10] = step[10];
				687	output[26] = step[11];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	688	output[6] = step[12];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	689	output[22] = step[13];
				690	output[14] = step[14];
				691	output[30] = step[15];
				692
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	693	output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	694	output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	695	output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	696	output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	697	output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	698	output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
				699	output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
				700	output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	701	output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	702	output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
				703	output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
				704	output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	705	output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	706	output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
				707	output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
				708	output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
				709	}
				710
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	711	void aom_fdct32x32_c(const int16_t input, tran_low_t out, int stride) {
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	712	int i, j;
				713	tran_high_t output[32 * 32];
				714
				715	// Columns
				716	for (i = 0; i < 32; ++i) {
				717	tran_high_t temp_in[32], temp_out[32];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	718	for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	719	aom_fdct32(temp_in, temp_out, 0);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	720	for (j = 0; j < 32; ++j)
				721	output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
				722	}
				723
				724	// Rows
				725	for (i = 0; i < 32; ++i) {
				726	tran_high_t temp_in[32], temp_out[32];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	727	for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	728	aom_fdct32(temp_in, temp_out, 0);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	729	for (j = 0; j < 32; ++j)
				730	out[j + i * 32] =
				731	(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
				732	}
				733	}
				734
				735	// Note that although we use dct_32_round in dct32 computation flow,
				736	// this 2d fdct32x32 for rate-distortion optimization loop is operating
				737	// within 16 bits precision.
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	738	void aom_fdct32x32_rd_c(const int16_t input, tran_low_t out, int stride) {
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	739	int i, j;
				740	tran_high_t output[32 * 32];
				741
				742	// Columns
				743	for (i = 0; i < 32; ++i) {
				744	tran_high_t temp_in[32], temp_out[32];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	745	for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	746	aom_fdct32(temp_in, temp_out, 0);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	747	for (j = 0; j < 32; ++j)
				748	// TODO(cd): see quality impact of only doing
				749	// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	750	// PS: also change code in aom_dsp/x86/aom_dct_sse2.c
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	751	output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
				752	}
				753
				754	// Rows
				755	for (i = 0; i < 32; ++i) {
				756	tran_high_t temp_in[32], temp_out[32];
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	757	for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	758	aom_fdct32(temp_in, temp_out, 1);
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	759	for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	760	}
				761	}
				762
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	763	void aom_fdct32x32_1_c(const int16_t input, tran_low_t output, int stride) {
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	764	int r, c;
				765	tran_low_t sum = 0;
				766	for (r = 0; r < 32; ++r)
clang-format	99e28b8	2016-01-27 12:42:45 -0800	[diff] [blame]	767	for (c = 0; c < 32; ++c) sum += input[r * stride + c];
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	768
				769	output[0] = sum >> 3;
				770	output[1] = 0;
				771	}
				772
Yaowu Xu	01dee0b	2016-03-25 12:43:01 -0700	[diff] [blame]	773	#if CONFIG_AOM_HIGHBITDEPTH
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	774	void aom_highbd_fdct4x4_c(const int16_t input, tran_low_t output,
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	775	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	776	aom_fdct4x4_c(input, output, stride);
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	777	}
				778
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	779	void aom_highbd_fdct8x8_c(const int16_t input, tran_low_t final_output,
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	780	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	781	aom_fdct8x8_c(input, final_output, stride);
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	782	}
				783
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	784	void aom_highbd_fdct8x8_1_c(const int16_t input, tran_low_t final_output,
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	785	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	786	aom_fdct8x8_1_c(input, final_output, stride);
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	787	}
				788
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	789	void aom_highbd_fdct16x16_c(const int16_t input, tran_low_t output,
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	790	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	791	aom_fdct16x16_c(input, output, stride);
Jingning Han	b67821f	2015-07-21 11:56:36 -0700	[diff] [blame]	792	}
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	793
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	794	void aom_highbd_fdct16x16_1_c(const int16_t input, tran_low_t output,
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	795	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	796	aom_fdct16x16_1_c(input, output, stride);
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	797	}
				798
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	799	void aom_highbd_fdct32x32_c(const int16_t input, tran_low_t out, int stride) {
				800	aom_fdct32x32_c(input, out, stride);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	801	}
				802
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	803	void aom_highbd_fdct32x32_rd_c(const int16_t input, tran_low_t out,
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	804	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	805	aom_fdct32x32_rd_c(input, out, stride);
Jingning Han	a6a4659	2015-07-27 16:05:15 -0700	[diff] [blame]	806	}
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	807
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	808	void aom_highbd_fdct32x32_1_c(const int16_t input, tran_low_t out,
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	809	int stride) {
Adrian Grange	cebe6f0	2016-03-25 12:11:05 -0700	[diff] [blame]	810	aom_fdct32x32_1_c(input, out, stride);
Jingning Han	d19033f	2015-07-28 14:42:25 -0700	[diff] [blame]	811	}
Yaowu Xu	01dee0b	2016-03-25 12:43:01 -0700	[diff] [blame]	812	#endif // CONFIG_AOM_HIGHBITDEPTH