aom_dsp/fft.c - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/fft_common.h"

 static INLINE void simple_transpose(const float *A, float *B, int n) {
   for (int y = 0; y < n; y++) {
     for (int x = 0; x < n; x++) {
       B[y * n + x] = A[x * n + y];
     }
   }
 }

 // The 1d transform is real to complex and packs the complex results in
 // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
 // components, followed by the n/2 - 1 imaginary components). After the
 // transform is done on the rows, the first n/2 + 1 columns are real, and
 // the remaining are the imaginary components. After the transform on the
 // columns, the region of [0, n/2]x[0, n/2] contains the real part of
 // fft of the real columns. The real part of the 2d fft also includes the
 // imaginary part of transformed imaginary columns. This function assembles
 // the correct outputs while putting the real and imaginary components
 // next to each other.
 static INLINE void unpack_2d_output(const float *col_fft, float *output,
                                     int n) {
   for (int y = 0; y <= n / 2; ++y) {
     const int y2 = y + n / 2;
     const int y_extra = y2 > n / 2 && y2 < n;

     for (int x = 0; x <= n / 2; ++x) {
       const int x2 = x + n / 2;
       const int x_extra = x2 > n / 2 && x2 < n;
       output[2 * (y * n + x)] =
           col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
       output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
                                     (x_extra ? col_fft[y * n + x2] : 0);
       if (y_extra) {
         output[2 * ((n - y) * n + x)] =
             col_fft[y * n + x] +
             (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
         output[2 * ((n - y) * n + x) + 1] =
             -(y_extra ? col_fft[y2 * n + x] : 0) +
             (x_extra ? col_fft[y * n + x2] : 0);
       }
     }
   }
 }

 void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
                     aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
                     aom_fft_unpack_func_t unpack, int vec_size) {
   for (int x = 0; x < n; x += vec_size) {
     tform(input + x, output + x, n);
   }
   transpose(output, temp, n);

   for (int x = 0; x < n; x += vec_size) {
     tform(temp + x, output + x, n);
   }
   transpose(output, temp, n);

   unpack(temp, output, n);
 }

 static INLINE void store_float(float *output, float input) { *output = input; }
 static INLINE float add_float(float a, float b) { return a + b; }
 static INLINE float sub_float(float a, float b) { return a - b; }
 static INLINE float mul_float(float a, float b) { return a * b; }

 GEN_FFT_2(void, float, float, float, *, store_float);
 GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
           sub_float);
 GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
           sub_float, mul_float);
 GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
            sub_float, mul_float);
 GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
            sub_float, mul_float);

 void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
                  unpack_2d_output, 1);
 }

 void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
                  unpack_2d_output, 1);
 }

 void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
                  unpack_2d_output, 1);
 }

 void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
                  unpack_2d_output, 1);
 }

 void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
                  unpack_2d_output, 1);
 }

 void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
                      aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
                      aom_fft_1d_func_t ifft_multi,
                      aom_fft_transpose_func_t transpose, int vec_size) {
   // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
   // and get real outputs.
   for (int y = 0; y <= n / 2; ++y) {
     output[y * n] = input[2 * y * n];
     output[y * n + 1] = input[2 * (y * n + n / 2)];
   }
   for (int y = n / 2 + 1; y < n; ++y) {
     output[y * n] = input[2 * (y - n / 2) * n + 1];
     output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
   }

   for (int i = 0; i < 2; i += vec_size) {
     ifft_multi(output + i, temp + i, n);
   }

   // For the other columns, since we don't have a full ifft for complex inputs
   // we have to split them into the real and imaginary counterparts.
   // Pack the real component, then the imaginary components.
   for (int y = 0; y < n; ++y) {
     for (int x = 1; x < n / 2; ++x) {
       output[y * n + (x + 1)] = input[2 * (y * n + x)];
     }
     for (int x = 1; x < n / 2; ++x) {
       output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
     }
   }
   for (int y = 2; y < vec_size; y++) {
     fft_single(output + y, temp + y, n);
   }
   // This is the part that can be sped up with SIMD
   for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
     fft_multi(output + y, temp + y, n);
   }

   // Put the 0 and n/2 th results in the correct place.
   for (int x = 0; x < n; ++x) {
     output[x] = temp[x * n];
     output[(n / 2) * n + x] = temp[x * n + 1];
   }
   // This rearranges and transposes.
   for (int y = 1; y < n / 2; ++y) {
     // Fill in the real columns
     for (int x = 0; x <= n / 2; ++x) {
       output[x + y * n] =
           temp[(y + 1) + x * n] +
           ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
     }
     for (int x = n / 2 + 1; x < n; ++x) {
       output[x + y * n] = temp[(y + 1) + (n - x) * n] -
                           temp[(y + n / 2) + ((n - x) + n / 2) * n];
     }
     // Fill in the imag columns
     for (int x = 0; x <= n / 2; ++x) {
       output[x + (y + n / 2) * n] =
           temp[(y + n / 2) + x * n] -
           ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
     }
     for (int x = n / 2 + 1; x < n; ++x) {
       output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
                                     temp[(y + n / 2) + (n - x) * n];
     }
   }
   for (int y = 0; y < n; y += vec_size) {
     ifft_multi(output + y, temp + y, n);
   }
   transpose(temp, output, n);
 }

 GEN_IFFT_2(void, float, float, float, *, store_float);
 GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
            sub_float);
 GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
            sub_float, mul_float);
 GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
             sub_float, mul_float);
 GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
             sub_float, mul_float);

 void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
                   aom_ifft1d_2_float, simple_transpose, 1);
 }

 void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
                   aom_ifft1d_4_float, simple_transpose, 1);
 }

 void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
                   aom_ifft1d_8_float, simple_transpose, 1);
 }

 void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
                   aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
 }

 void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
                   aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
 }
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include "aom_dsp/aom_dsp_common.h"
	#include "aom_dsp/fft_common.h"

	static INLINE void simple_transpose(const float A, float B, int n) {
	for (int y = 0; y < n; y++) {
	for (int x = 0; x < n; x++) {
	B[y * n + x] = A[x * n + y];
	}
	}
	}

	// The 1d transform is real to complex and packs the complex results in
	// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
	// components, followed by the n/2 - 1 imaginary components). After the
	// transform is done on the rows, the first n/2 + 1 columns are real, and
	// the remaining are the imaginary components. After the transform on the
	// columns, the region of [0, n/2]x[0, n/2] contains the real part of
	// fft of the real columns. The real part of the 2d fft also includes the
	// imaginary part of transformed imaginary columns. This function assembles
	// the correct outputs while putting the real and imaginary components
	// next to each other.
	static INLINE void unpack_2d_output(const float col_fft, float output,
	int n) {
	for (int y = 0; y <= n / 2; ++y) {
	const int y2 = y + n / 2;
	const int y_extra = y2 > n / 2 && y2 < n;

	for (int x = 0; x <= n / 2; ++x) {
	const int x2 = x + n / 2;
	const int x_extra = x2 > n / 2 && x2 < n;
	output[2 * (y * n + x)] =
	col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
	output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
	(x_extra ? col_fft[y * n + x2] : 0);
	if (y_extra) {
	output[2 * ((n - y) * n + x)] =
	col_fft[y * n + x] +
	(x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
	output[2 * ((n - y) * n + x) + 1] =
	-(y_extra ? col_fft[y2 * n + x] : 0) +
	(x_extra ? col_fft[y * n + x2] : 0);
	}
	}
	}
	}

	void aom_fft_2d_gen(const float input, float temp, float *output, int n,
	aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
	aom_fft_unpack_func_t unpack, int vec_size) {
	for (int x = 0; x < n; x += vec_size) {
	tform(input + x, output + x, n);
	}
	transpose(output, temp, n);

	for (int x = 0; x < n; x += vec_size) {
	tform(temp + x, output + x, n);
	}
	transpose(output, temp, n);

	unpack(temp, output, n);
	}

	static INLINE void store_float(float output, float input) { output = input; }
	static INLINE float add_float(float a, float b) { return a + b; }
	static INLINE float sub_float(float a, float b) { return a - b; }
	static INLINE float mul_float(float a, float b) { return a * b; }

	GEN_FFT_2(void, float, float, float, *, store_float);
	GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
	sub_float);
	GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);
	GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);
	GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);

	void aom_fft2x2_float_c(const float input, float temp, float *output) {
	aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
	unpack_2d_output, 1);
	}

	void aom_fft4x4_float_c(const float input, float temp, float *output) {
	aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
	unpack_2d_output, 1);
	}

	void aom_fft8x8_float_c(const float input, float temp, float *output) {
	aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
	unpack_2d_output, 1);
	}

	void aom_fft16x16_float_c(const float input, float temp, float *output) {
	aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
	unpack_2d_output, 1);
	}

	void aom_fft32x32_float_c(const float input, float temp, float *output) {
	aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
	unpack_2d_output, 1);
	}

	void aom_ifft_2d_gen(const float input, float temp, float *output, int n,
	aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
	aom_fft_1d_func_t ifft_multi,
	aom_fft_transpose_func_t transpose, int vec_size) {
	// Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
	// and get real outputs.
	for (int y = 0; y <= n / 2; ++y) {
	output[y * n] = input[2 * y * n];
	output[y * n + 1] = input[2 * (y * n + n / 2)];
	}
	for (int y = n / 2 + 1; y < n; ++y) {
	output[y * n] = input[2 * (y - n / 2) * n + 1];
	output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
	}

	for (int i = 0; i < 2; i += vec_size) {
	ifft_multi(output + i, temp + i, n);
	}

	// For the other columns, since we don't have a full ifft for complex inputs
	// we have to split them into the real and imaginary counterparts.
	// Pack the real component, then the imaginary components.
	for (int y = 0; y < n; ++y) {
	for (int x = 1; x < n / 2; ++x) {
	output[y * n + (x + 1)] = input[2 * (y * n + x)];
	}
	for (int x = 1; x < n / 2; ++x) {
	output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
	}
	}
	for (int y = 2; y < vec_size; y++) {
	fft_single(output + y, temp + y, n);
	}
	// This is the part that can be sped up with SIMD
	for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
	fft_multi(output + y, temp + y, n);
	}

	// Put the 0 and n/2 th results in the correct place.
	for (int x = 0; x < n; ++x) {
	output[x] = temp[x * n];
	output[(n / 2) * n + x] = temp[x * n + 1];
	}
	// This rearranges and transposes.
	for (int y = 1; y < n / 2; ++y) {
	// Fill in the real columns
	for (int x = 0; x <= n / 2; ++x) {
	output[x + y * n] =
	temp[(y + 1) + x * n] +
	((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
	}
	for (int x = n / 2 + 1; x < n; ++x) {
	output[x + y * n] = temp[(y + 1) + (n - x) * n] -
	temp[(y + n / 2) + ((n - x) + n / 2) * n];
	}
	// Fill in the imag columns
	for (int x = 0; x <= n / 2; ++x) {
	output[x + (y + n / 2) * n] =
	temp[(y + n / 2) + x * n] -
	((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
	}
	for (int x = n / 2 + 1; x < n; ++x) {
	output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
	temp[(y + n / 2) + (n - x) * n];
	}
	}
	for (int y = 0; y < n; y += vec_size) {
	ifft_multi(output + y, temp + y, n);
	}
	transpose(temp, output, n);
	}

	GEN_IFFT_2(void, float, float, float, *, store_float);
	GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
	sub_float);
	GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);
	GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);
	GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
	sub_float, mul_float);

	void aom_ifft2x2_float_c(const float input, float temp, float *output) {
	aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
	aom_ifft1d_2_float, simple_transpose, 1);
	}

	void aom_ifft4x4_float_c(const float input, float temp, float *output) {
	aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
	aom_ifft1d_4_float, simple_transpose, 1);
	}

	void aom_ifft8x8_float_c(const float input, float temp, float *output) {
	aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
	aom_ifft1d_8_float, simple_transpose, 1);
	}

	void aom_ifft16x16_float_c(const float input, float temp, float *output) {
	aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
	aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
	}

	void aom_ifft32x32_float_c(const float input, float temp, float *output) {
	aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
	aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
	}