Blame - aom_dsp/fwd_txfm.c - avm

blob: 853e5ec80b94903def5793fc91947d8d58f29dd4 [file] [log] [blame]

Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	1	/*
Krishna Rapaka	7319db5	2021-09-28 20:35:29 -0700	[diff] [blame]	2	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	3	*
Vibhoothi	41c6dd7	2021-10-12 18:48:26 +0000	[diff] [blame]	4	* This source code is subject to the terms of the BSD 3-Clause Clear License
				5	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
				6	* License was not distributed with this source code in the LICENSE file, you
				7	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
				8	* Alliance for Open Media Patent License 1.0 was not distributed with this
				9	* source code in the PATENTS file, you can obtain it at
				10	* aomedia.org/license/patent-license/.
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	11	*/
				12
Urvang Joshi	fdb6096	2016-10-14 15:30:27 -0700	[diff] [blame]	13	#include <assert.h>
Urvang Joshi	698720b	2018-05-09 15:04:31 -0400	[diff] [blame]	14	#include "aom_dsp/txfm_common.h"
Tom Finegan	44702c8	2018-05-22 13:00:39 -0700	[diff] [blame]	15	#include "config/aom_dsp_rtcd.h"
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	16
Jerome Jiang	d4e351d	2020-01-30 15:13:11 -0800	[diff] [blame]	17	void aom_fdct4x4_c(const int16_t input, tran_low_t output, int stride) {
				18	// The 2D transform is done with two passes which are actually pretty
				19	// similar. In the first one, we transform the columns and transpose
				20	// the results. In the second one, we transform the rows. To achieve that,
				21	// as the first pass results are transposed, we transpose the columns (that
				22	// is the transposed rows) and transpose the results (so that it goes back
				23	// in normal/row positions).
				24	// We need an intermediate buffer between passes.
				25	tran_low_t intermediate[4 * 4];
				26	const tran_low_t *in_low = NULL;
				27	tran_low_t *out = intermediate;
				28	// Do the two transform/transpose passes
				29	for (int pass = 0; pass < 2; ++pass) {
				30	tran_high_t in_high[4]; // canbe16
				31	tran_high_t step[4]; // canbe16
				32	tran_high_t temp1, temp2; // needs32
				33	for (int i = 0; i < 4; ++i) {
				34	// Load inputs.
				35	if (pass == 0) {
				36	in_high[0] = input[0 * stride] * 16;
				37	in_high[1] = input[1 * stride] * 16;
				38	in_high[2] = input[2 * stride] * 16;
				39	in_high[3] = input[3 * stride] * 16;
				40	if (i == 0 && in_high[0]) {
				41	++in_high[0];
				42	}
				43	} else {
				44	assert(in_low != NULL);
				45	in_high[0] = in_low[0 * 4];
				46	in_high[1] = in_low[1 * 4];
				47	in_high[2] = in_low[2 * 4];
				48	in_high[3] = in_low[3 * 4];
				49	++in_low;
				50	}
				51	// Transform.
				52	step[0] = in_high[0] + in_high[3];
				53	step[1] = in_high[1] + in_high[2];
				54	step[2] = in_high[1] - in_high[2];
				55	step[3] = in_high[0] - in_high[3];
				56	temp1 = (step[0] + step[1]) * cospi_16_64;
				57	temp2 = (step[0] - step[1]) * cospi_16_64;
				58	out[0] = (tran_low_t)fdct_round_shift(temp1);
				59	out[2] = (tran_low_t)fdct_round_shift(temp2);
				60	temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
				61	temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
				62	out[1] = (tran_low_t)fdct_round_shift(temp1);
				63	out[3] = (tran_low_t)fdct_round_shift(temp2);
				64	// Do next column (which is a transposed row in second/horizontal pass)
				65	++input;
				66	out += 4;
				67	}
				68	// Setup in/out for next pass.
				69	in_low = intermediate;
				70	out = output;
				71	}
				72
				73	for (int i = 0; i < 4; ++i) {
				74	for (int j = 0; j < 4; ++j)
				75	output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
				76	}
				77	}
				78
				79	void aom_fdct4x4_lp_c(const int16_t input, int16_t output, int stride) {
				80	// The 2D transform is done with two passes which are actually pretty
				81	// similar. In the first one, we transform the columns and transpose
				82	// the results. In the second one, we transform the rows. To achieve that,
				83	// as the first pass results are transposed, we transpose the columns (that
				84	// is the transposed rows) and transpose the results (so that it goes back
				85	// in normal/row positions).
				86	// We need an intermediate buffer between passes.
				87	int16_t intermediate[4 * 4];
				88	const int16_t *in_low = NULL;
				89	int16_t *out = intermediate;
				90	// Do the two transform/transpose passes
				91	for (int pass = 0; pass < 2; ++pass) {
				92	int32_t in_high[4]; // canbe16
				93	int32_t step[4]; // canbe16
				94	int32_t temp1, temp2; // needs32
				95	for (int i = 0; i < 4; ++i) {
				96	// Load inputs.
				97	if (pass == 0) {
				98	in_high[0] = input[0 * stride] * 16;
				99	in_high[1] = input[1 * stride] * 16;
				100	in_high[2] = input[2 * stride] * 16;
				101	in_high[3] = input[3 * stride] * 16;
				102	if (i == 0 && in_high[0]) {
				103	++in_high[0];
				104	}
				105	} else {
				106	assert(in_low != NULL);
				107	in_high[0] = in_low[0 * 4];
				108	in_high[1] = in_low[1 * 4];
				109	in_high[2] = in_low[2 * 4];
				110	in_high[3] = in_low[3 * 4];
				111	++in_low;
				112	}
				113	// Transform.
				114	step[0] = in_high[0] + in_high[3];
				115	step[1] = in_high[1] + in_high[2];
				116	step[2] = in_high[1] - in_high[2];
				117	step[3] = in_high[0] - in_high[3];
Jerome Jiang	89b884c	2020-02-04 11:18:31 -0800	[diff] [blame]	118	temp1 = (step[0] + step[1]) * (int32_t)cospi_16_64;
				119	temp2 = (step[0] - step[1]) * (int32_t)cospi_16_64;
Jerome Jiang	d4e351d	2020-01-30 15:13:11 -0800	[diff] [blame]	120	out[0] = (int16_t)fdct_round_shift(temp1);
				121	out[2] = (int16_t)fdct_round_shift(temp2);
Jerome Jiang	89b884c	2020-02-04 11:18:31 -0800	[diff] [blame]	122	temp1 = step[2] * (int32_t)cospi_24_64 + step[3] * (int32_t)cospi_8_64;
				123	temp2 = -step[2] * (int32_t)cospi_8_64 + step[3] * (int32_t)cospi_24_64;
Jerome Jiang	d4e351d	2020-01-30 15:13:11 -0800	[diff] [blame]	124	out[1] = (int16_t)fdct_round_shift(temp1);
				125	out[3] = (int16_t)fdct_round_shift(temp2);
				126	// Do next column (which is a transposed row in second/horizontal pass)
				127	++input;
				128	out += 4;
				129	}
				130	// Setup in/out for next pass.
				131	in_low = intermediate;
				132	out = output;
				133	}
				134
				135	for (int i = 0; i < 4; ++i) {
				136	for (int j = 0; j < 4; ++j)
				137	output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
				138	}
				139	}
				140
James Almer	857e93f	2022-05-25 16:44:43 +0000	[diff] [blame]	141	void aom_highbd_fdct8x8_c(const int16_t input, tran_low_t final_output,
				142	int stride) {
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	143	int i, j;
				144	tran_low_t intermediate[64];
				145	int pass;
				146	tran_low_t *output = intermediate;
				147	const tran_low_t *in = NULL;
				148
				149	// Transform columns
				150	for (pass = 0; pass < 2; ++pass) {
				151	tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
				152	tran_high_t t0, t1, t2, t3; // needs32
				153	tran_high_t x0, x1, x2, x3; // canbe16
				154
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	155	for (i = 0; i < 8; i++) {
				156	// stage 1
				157	if (pass == 0) {
				158	s0 = (input[0 * stride] + input[7 * stride]) * 4;
				159	s1 = (input[1 * stride] + input[6 * stride]) * 4;
				160	s2 = (input[2 * stride] + input[5 * stride]) * 4;
				161	s3 = (input[3 * stride] + input[4 * stride]) * 4;
				162	s4 = (input[3 * stride] - input[4 * stride]) * 4;
				163	s5 = (input[2 * stride] - input[5 * stride]) * 4;
				164	s6 = (input[1 * stride] - input[6 * stride]) * 4;
				165	s7 = (input[0 * stride] - input[7 * stride]) * 4;
				166	++input;
				167	} else {
				168	s0 = in[0 * 8] + in[7 * 8];
				169	s1 = in[1 * 8] + in[6 * 8];
				170	s2 = in[2 * 8] + in[5 * 8];
				171	s3 = in[3 * 8] + in[4 * 8];
				172	s4 = in[3 * 8] - in[4 * 8];
				173	s5 = in[2 * 8] - in[5 * 8];
				174	s6 = in[1 * 8] - in[6 * 8];
				175	s7 = in[0 * 8] - in[7 * 8];
				176	++in;
				177	}
				178
				179	// fdct4(step, step);
				180	x0 = s0 + s3;
				181	x1 = s1 + s2;
				182	x2 = s1 - s2;
				183	x3 = s0 - s3;
				184	t0 = (x0 + x1) * cospi_16_64;
				185	t1 = (x0 - x1) * cospi_16_64;
				186	t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
				187	t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
				188	output[0] = (tran_low_t)fdct_round_shift(t0);
				189	output[2] = (tran_low_t)fdct_round_shift(t2);
				190	output[4] = (tran_low_t)fdct_round_shift(t1);
				191	output[6] = (tran_low_t)fdct_round_shift(t3);
				192
				193	// Stage 2
				194	t0 = (s6 - s5) * cospi_16_64;
				195	t1 = (s6 + s5) * cospi_16_64;
				196	t2 = fdct_round_shift(t0);
				197	t3 = fdct_round_shift(t1);
				198
				199	// Stage 3
				200	x0 = s4 + t2;
				201	x1 = s4 - t2;
				202	x2 = s7 - t3;
				203	x3 = s7 + t3;
				204
				205	// Stage 4
				206	t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
				207	t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
				208	t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
				209	t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
				210	output[1] = (tran_low_t)fdct_round_shift(t0);
				211	output[3] = (tran_low_t)fdct_round_shift(t2);
				212	output[5] = (tran_low_t)fdct_round_shift(t1);
				213	output[7] = (tran_low_t)fdct_round_shift(t3);
				214	output += 8;
				215	}
				216	in = intermediate;
				217	output = final_output;
				218	}
				219
				220	// Rows
				221	for (i = 0; i < 8; ++i) {
				222	for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
				223	}
				224	}