Add 2d Wiener denoiser for regrainer

This also adds support for a 2d fft (float), which is used both for
denoising and for noise power spectral density estimation.
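
An illustrative caller sketch (not part of this change; buffer sizes follow
the fft_common.h documentation, the rtcd-generated header declaring the
dispatch symbols is assumed to be included, and any SIMD alignment
requirements are not shown):

    // Forward 8x8 float fft: input and temp take n*n floats, output 2*n*n.
    // The inverse pass uses a 2*n*n temp, so temp is sized for that case.
    float input[64], temp[128], output[128], recon[64];
    /* ... fill input with an 8x8 block of samples ... */
    aom_fft8x8_float(input, temp, output);
    // The inverse consumes the packed spectrum and writes an n*n real block
    // (scaling/normalization is not addressed in this sketch).
    aom_ifft8x8_float(output, temp, recon);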

Change-Id: I525d0712235b566d1004aa8b6d0ad0d81eebca67
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 7b04398..47fbd66 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -26,6 +26,8 @@
             "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
             "${AOM_ROOT}/aom_dsp/entcode.c"
             "${AOM_ROOT}/aom_dsp/entcode.h"
+            "${AOM_ROOT}/aom_dsp/fft.c"
+            "${AOM_ROOT}/aom_dsp/fft_common.h"
             "${AOM_ROOT}/aom_dsp/intrapred.c"
             "${AOM_ROOT}/aom_dsp/intrapred_common.h"
             "${AOM_ROOT}/aom_dsp/loopfilter.c"
@@ -54,6 +56,7 @@
             "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
             "${AOM_ROOT}/aom_dsp/x86/convolve.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
@@ -81,6 +84,7 @@
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a8c4975..9ba5496 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -472,6 +472,34 @@
     add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct8x8 sse2/;
 
+    # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
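+    # add_proto declares the C prototype; specialize lists the SIMD versions
+    # (sse2/avx2) that the run-time CPU dispatcher may select.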
+    add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
+
+    add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft4x4_float                  sse2/;
+
+    add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft8x8_float avx2             sse2/;
+
+    add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft16x16_float avx2           sse2/;
+
+    add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft32x32_float avx2           sse2/;
+
+    add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
+
+    add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft4x4_float                 sse2/;
+
+    add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft8x8_float avx2            sse2/;
+
+    add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft16x16_float avx2          sse2/;
+
+    add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft32x32_float avx2          sse2/;
 }  # CONFIG_AV1_ENCODER
 
 #
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
new file mode 100644
index 0000000..99ddfc3
--- /dev/null
+++ b/aom_dsp/fft.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+  for (int y = 0; y < n; y++) {
+    for (int x = 0; x < n; x++) {
+      B[y * n + x] = A[x * n + y];
+    }
+  }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way that takes advantage of conjugate symmetry (i.e., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining columns are the imaginary components. After the transform on
+// the columns, the region [0, n/2] x [0, n/2] contains the real part of the
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of the transformed imaginary columns. This function
+// assembles the correct outputs while putting the real and imaginary
+// components next to each other.
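+// For example, with n = 4 a row transform produces [r0, r1, r2, i1]
+// (n/2 + 1 = 3 real values followed by n/2 - 1 = 1 imaginary value). The
+// unpacked 2d output buffer holds 2 * n * n floats with each (real, imag)
+// pair stored adjacently.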
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+                                    int n) {
+  for (int y = 0; y <= n / 2; ++y) {
+    const int y2 = y + n / 2;
+    const int y_extra = y2 > n / 2 && y2 < n;
+
+    for (int x = 0; x <= n / 2; ++x) {
+      const int x2 = x + n / 2;
+      const int x_extra = x2 > n / 2 && x2 < n;
+      output[2 * (y * n + x)] =
+          col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+      output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+                                    (x_extra ? col_fft[y * n + x2] : 0);
+      if (y_extra) {
+        output[2 * ((n - y) * n + x)] =
+            col_fft[y * n + x] +
+            (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+        output[2 * ((n - y) * n + x) + 1] =
+            -(y_extra ? col_fft[y2 * n + x] : 0) +
+            (x_extra ? col_fft[y * n + x2] : 0);
+      }
+    }
+  }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size) {
+  for (int x = 0; x < n; x += vec_size) {
+    tform(input + x, output + x, n);
+  }
+  transpose(output, temp, n);
+
+  for (int x = 0; x < n; x += vec_size) {
+    tform(temp + x, output + x, n);
+  }
+  transpose(output, temp, n);
+
+  unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float float_constant(float input) { return input; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float);
+GEN_FFT_8(void, float, float, float, *, store_float, float_constant);
+GEN_FFT_16(void, float, float, float, *, store_float, float_constant);
+GEN_FFT_32(void, float, float, float, *, store_float, float_constant);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size) {
+  // Columns 0 and n/2 have conjugate symmetry, so we can directly do the
+  // ifft and get real outputs.
+  for (int y = 0; y <= n / 2; ++y) {
+    output[y * n] = input[2 * y * n];
+    output[y * n + 1] = input[2 * (y * n + n / 2)];
+  }
+  for (int y = n / 2 + 1; y < n; ++y) {
+    output[y * n] = input[2 * (y - n / 2) * n + 1];
+    output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+  }
+
+  for (int i = 0; i < 2; i += vec_size) {
+    ifft_multi(output + i, temp + i, n);
+  }
+
+  // For the other columns, since we don't have a full ifft for complex
+  // inputs, we have to split them into their real and imaginary parts.
+  // Pack the real components first, then the imaginary components.
+  for (int y = 0; y < n; ++y) {
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + 1)] = input[2 * (y * n + x)];
+    }
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+    }
+  }
+  for (int y = 2; y < vec_size; y++) {
+    fft_single(output + y, temp + y, n);
+  }
+  // This is the part that can be sped up with SIMD
+  for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+    fft_multi(output + y, temp + y, n);
+  }
+
+  // Put the 0th and (n/2)th results in the correct place.
+  for (int x = 0; x < n; ++x) {
+    output[x] = temp[x * n];
+    output[(n / 2) * n + x] = temp[x * n + 1];
+  }
+  // This rearranges and transposes.
+  for (int y = 1; y < n / 2; ++y) {
+    // Fill in the real columns
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + y * n] =
+          temp[(y + 1) + x * n] +
+          ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+                          temp[(y + n / 2) + ((n - x) + n / 2) * n];
+    }
+    // Fill in the imag columns
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + (y + n / 2) * n] =
+          temp[(y + n / 2) + x * n] -
+          ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+                                    temp[(y + n / 2) + (n - x) * n];
+    }
+  }
+  for (int y = 0; y < n; y += vec_size) {
+    ifft_multi(output + y, temp + y, n);
+  }
+  transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, float_constant);
+GEN_IFFT_16(void, float, float, float, *, store_float, float_constant);
+GEN_IFFT_32(void, float, float, float, *, store_float, float_constant);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+                  aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+                  aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+                  aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
new file mode 100644
index 0000000..3b1f495
--- /dev/null
+++ b/aom_dsp/fft_common.h
@@ -0,0 +1,1100 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FFT_COMMON_H_
+#define AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
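+ * For example, a forward transform of size n = 8 packs its output as
+ * [r_0, r_1, r_2, r_3, r_4, i_1, i_2, i_3].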
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in]  input   Input buffer. See above for size restrictions.
+ * \param[out] output  Output buffer. See above for size restrictions.
+ * \param[in]  stride  The spacing, in number of elements, between rows (or
+ *                     between consecutive elements for a non-vectorized
+ *                     transform)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+                                  int stride);
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+                                         int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in]  input     Input buffer to run the transform on (size n x n)
+ * \param[out] temp      Working buffer for computing the transform (size n x n)
+ * \param[out] output    Output buffer (size 2 x n x n)
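+ * \param[in]  n         Extent of one dimension of the square input (n x n)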
+ * \param[in]  tform     Forward transform function
+ * \param[in]  transpose Transpose function (for n x n matrix)
+ * \param[in]  unpack    Unpack function used to massage outputs to correct form
+ * \param[in]  vec_size  Vector size (the transform is done vec_size units at
+ *                       a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp       Working buffer for computations (size 2 x n x n)
+ * \param[out] output     Output buffer (size n x n)
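+ * \param[in]  n          Extent of one dimension of the square output (n x n)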
+ * \param[in]  fft_single Forward transform function (non vectorized)
+ * \param[in]  fft_multi  Forward transform function (vectorized)
+ * \param[in]  ifft_multi Inverse transform function (vectorized)
+ * \param[in]  transpose  Transpose function (for n x n matrix)
+ * \param[in]  vec_size   Vector size (the transform is done vec_size
+ *                        units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
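+// For example, fft.c instantiates the plain float versions with
+// GEN_FFT_8(void, float, float, float, *, store_float, float_constant),
+// where "load" is the dereference operator and store/constant are trivial
+// scalar helpers; SIMD versions substitute vector types and intrinsics.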
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                      \
+    const T_VEC i1 = load(input + 1 * stride);                      \
+    store(output + 0 * stride, i0 + i1);                            \
+    store(output + 1 * stride, i0 - i1);                            \
+  }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                      \
+    const T_VEC i1 = load(input + 1 * stride);                      \
+    const T_VEC i2 = load(input + 2 * stride);                      \
+    const T_VEC i3 = load(input + 3 * stride);                      \
+    const T_VEC w0 = i0 + i2;                                       \
+    const T_VEC w1 = i0 - i2;                                       \
+    const T_VEC w2 = i1 + i3;                                       \
+    const T_VEC w3 = i1 - i3;                                       \
+    store(output + 0 * stride, w0 + w2);                            \
+    store(output + 1 * stride, w1);                                 \
+    store(output + 2 * stride, w0 - w2);                            \
+    store(output + 3 * stride, -w3);                                \
+  }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant)            \
+  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {        \
+    const T_VEC kWeight2 = constant(0.707107f);                            \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC i4 = load(input + 4 * stride);                             \
+    const T_VEC i5 = load(input + 5 * stride);                             \
+    const T_VEC i6 = load(input + 6 * stride);                             \
+    const T_VEC i7 = load(input + 7 * stride);                             \
+    const T_VEC w0 = i0 + i4;                                              \
+    const T_VEC w1 = i0 - i4;                                              \
+    const T_VEC w2 = i2 + i6;                                              \
+    const T_VEC w3 = i2 - i6;                                              \
+    const T_VEC w4 = w0 + w2;                                              \
+    const T_VEC w5 = w0 - w2;                                              \
+    /* w6 = -w3 is not worth a temporary; it is replaced with -w3 below */ \
+    const T_VEC w7 = i1 + i5;                                              \
+    const T_VEC w8 = i1 - i5;                                              \
+    const T_VEC w9 = i3 + i7;                                              \
+    const T_VEC w10 = i3 - i7;                                             \
+    const T_VEC w11 = w7 + w9;                                             \
+    const T_VEC w12 = w7 - w9;                                             \
+    store(output + 0 * stride, w4 + w11);                                  \
+    store(output + 1 * stride, w1 + (kWeight2 * w8 - kWeight2 * w10));     \
+    store(output + 2 * stride, w5);                                        \
+    store(output + 3 * stride, w1 + (-kWeight2 * w8 + kWeight2 * w10));    \
+    store(output + 4 * stride, w4 - w11);                                  \
+    store(output + 5 * stride, -w3 - (kWeight2 * w10 + kWeight2 * w8));    \
+    store(output + 6 * stride, -w12);                                      \
+    store(output + 7 * stride, w3 + (-kWeight2 * w10 - kWeight2 * w8));    \
+  }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant)              \
+  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {          \
+    const T_VEC kWeight2 = constant(0.707107f);                               \
+    const T_VEC kWeight3 = constant(0.92388f);                                \
+    const T_VEC kWeight4 = constant(0.382683f);                               \
+    const T_VEC i0 = load(input + 0 * stride);                                \
+    const T_VEC i1 = load(input + 1 * stride);                                \
+    const T_VEC i2 = load(input + 2 * stride);                                \
+    const T_VEC i3 = load(input + 3 * stride);                                \
+    const T_VEC i4 = load(input + 4 * stride);                                \
+    const T_VEC i5 = load(input + 5 * stride);                                \
+    const T_VEC i6 = load(input + 6 * stride);                                \
+    const T_VEC i7 = load(input + 7 * stride);                                \
+    const T_VEC i8 = load(input + 8 * stride);                                \
+    const T_VEC i9 = load(input + 9 * stride);                                \
+    const T_VEC i10 = load(input + 10 * stride);                              \
+    const T_VEC i11 = load(input + 11 * stride);                              \
+    const T_VEC i12 = load(input + 12 * stride);                              \
+    const T_VEC i13 = load(input + 13 * stride);                              \
+    const T_VEC i14 = load(input + 14 * stride);                              \
+    const T_VEC i15 = load(input + 15 * stride);                              \
+    const T_VEC w0 = i0 + i8;                                                 \
+    const T_VEC w1 = i0 - i8;                                                 \
+    const T_VEC w2 = i4 + i12;                                                \
+    const T_VEC w3 = i4 - i12;                                                \
+    const T_VEC w4 = w0 + w2;                                                 \
+    const T_VEC w5 = w0 - w2;                                                 \
+    const T_VEC w7 = i2 + i10;                                                \
+    const T_VEC w8 = i2 - i10;                                                \
+    const T_VEC w9 = i6 + i14;                                                \
+    const T_VEC w10 = i6 - i14;                                               \
+    const T_VEC w11 = w7 + w9;                                                \
+    const T_VEC w12 = w7 - w9;                                                \
+    const T_VEC w14 = w4 + w11;                                               \
+    const T_VEC w15 = w4 - w11;                                               \
+    const T_VEC w16[2] = { (T_VEC)(w1 + (kWeight2 * w8 - kWeight2 * w10)),    \
+                           (T_VEC)(-w3 - (kWeight2 * w10 + kWeight2 * w8)) }; \
+    const T_VEC w18[2] = { (T_VEC)(w1 + (-kWeight2 * w8 + kWeight2 * w10)),   \
+                           (T_VEC)(w3 + (-kWeight2 * w10 - kWeight2 * w8)) }; \
+    const T_VEC w19 = i1 + i9;                                                \
+    const T_VEC w20 = i1 - i9;                                                \
+    const T_VEC w21 = i5 + i13;                                               \
+    const T_VEC w22 = i5 - i13;                                               \
+    const T_VEC w23 = w19 + w21;                                              \
+    const T_VEC w24 = w19 - w21;                                              \
+    const T_VEC w26 = i3 + i11;                                               \
+    const T_VEC w27 = i3 - i11;                                               \
+    const T_VEC w28 = i7 + i15;                                               \
+    const T_VEC w29 = i7 - i15;                                               \
+    const T_VEC w30 = w26 + w28;                                              \
+    const T_VEC w31 = w26 - w28;                                              \
+    const T_VEC w33 = w23 + w30;                                              \
+    const T_VEC w34 = w23 - w30;                                              \
+    const T_VEC w35[2] = { (T_VEC)(w20 + (kWeight2 * w27 - kWeight2 * w29)),  \
+                           (T_VEC)(-w22 +                                     \
+                                   -(kWeight2 * w29 + kWeight2 * w27)) };     \
+    const T_VEC w37[2] = { (T_VEC)(w20 + (-kWeight2 * w27 + kWeight2 * w29)), \
+                           (T_VEC)(w22 +                                      \
+                                   (-kWeight2 * w29 - kWeight2 * w27)) };     \
+    store(output + 0 * stride, w14 + w33);                                    \
+    store(output + 1 * stride,                                                \
+          w16[0] + (kWeight3 * w35[0] + kWeight4 * w35[1]));                  \
+    store(output + 2 * stride, w5 + (kWeight2 * w24 - kWeight2 * w31));       \
+    store(output + 3 * stride,                                                \
+          w18[0] + (kWeight4 * w37[0] + kWeight3 * w37[1]));                  \
+    store(output + 4 * stride, w15);                                          \
+    store(output + 5 * stride,                                                \
+          w18[0] + (-kWeight4 * w37[0] - kWeight3 * w37[1]));                 \
+    store(output + 6 * stride, w5 + (-kWeight2 * w24 + kWeight2 * w31));      \
+    store(output + 7 * stride,                                                \
+          w16[0] + (-kWeight3 * w35[0] - kWeight4 * w35[1]));                 \
+    store(output + 8 * stride, w14 - w33);                                    \
+    store(output + 9 * stride,                                                \
+          w16[1] + (kWeight3 * w35[1] - kWeight4 * w35[0]));                  \
+    store(output + 10 * stride, -w12 - (kWeight2 * w31 + kWeight2 * w24));    \
+    store(output + 11 * stride,                                               \
+          w18[1] + (kWeight4 * w37[1] - kWeight3 * w37[0]));                  \
+    store(output + 12 * stride, -w34);                                        \
+    store(output + 13 * stride,                                               \
+          -w18[1] - (-kWeight4 * w37[1] + kWeight3 * w37[0]));                \
+    store(output + 14 * stride, w12 + (-kWeight2 * w31 - kWeight2 * w24));    \
+    store(output + 15 * stride,                                               \
+          -w16[1] - (-kWeight3 * w35[1] + kWeight4 * w35[0]));                \
+  }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant)              \
+  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {          \
+    const T_VEC kWeight2 = constant(0.707107f);                               \
+    const T_VEC kWeight3 = constant(0.92388f);                                \
+    const T_VEC kWeight4 = constant(0.382683f);                               \
+    const T_VEC kWeight5 = constant(0.980785f);                               \
+    const T_VEC kWeight6 = constant(0.19509f);                                \
+    const T_VEC kWeight7 = constant(0.83147f);                                \
+    const T_VEC kWeight8 = constant(0.55557f);                                \
+    const T_VEC i0 = load(input + 0 * stride);                                \
+    const T_VEC i1 = load(input + 1 * stride);                                \
+    const T_VEC i2 = load(input + 2 * stride);                                \
+    const T_VEC i3 = load(input + 3 * stride);                                \
+    const T_VEC i4 = load(input + 4 * stride);                                \
+    const T_VEC i5 = load(input + 5 * stride);                                \
+    const T_VEC i6 = load(input + 6 * stride);                                \
+    const T_VEC i7 = load(input + 7 * stride);                                \
+    const T_VEC i8 = load(input + 8 * stride);                                \
+    const T_VEC i9 = load(input + 9 * stride);                                \
+    const T_VEC i10 = load(input + 10 * stride);                              \
+    const T_VEC i11 = load(input + 11 * stride);                              \
+    const T_VEC i12 = load(input + 12 * stride);                              \
+    const T_VEC i13 = load(input + 13 * stride);                              \
+    const T_VEC i14 = load(input + 14 * stride);                              \
+    const T_VEC i15 = load(input + 15 * stride);                              \
+    const T_VEC i16 = load(input + 16 * stride);                              \
+    const T_VEC i17 = load(input + 17 * stride);                              \
+    const T_VEC i18 = load(input + 18 * stride);                              \
+    const T_VEC i19 = load(input + 19 * stride);                              \
+    const T_VEC i20 = load(input + 20 * stride);                              \
+    const T_VEC i21 = load(input + 21 * stride);                              \
+    const T_VEC i22 = load(input + 22 * stride);                              \
+    const T_VEC i23 = load(input + 23 * stride);                              \
+    const T_VEC i24 = load(input + 24 * stride);                              \
+    const T_VEC i25 = load(input + 25 * stride);                              \
+    const T_VEC i26 = load(input + 26 * stride);                              \
+    const T_VEC i27 = load(input + 27 * stride);                              \
+    const T_VEC i28 = load(input + 28 * stride);                              \
+    const T_VEC i29 = load(input + 29 * stride);                              \
+    const T_VEC i30 = load(input + 30 * stride);                              \
+    const T_VEC i31 = load(input + 31 * stride);                              \
+    const T_VEC w0 = i0 + i16;                                                \
+    const T_VEC w1 = i0 - i16;                                                \
+    const T_VEC w2 = i8 + i24;                                                \
+    const T_VEC w3 = i8 - i24;                                                \
+    const T_VEC w4 = w0 + w2;                                                 \
+    const T_VEC w5 = w0 - w2;                                                 \
+    const T_VEC w7 = i4 + i20;                                                \
+    const T_VEC w8 = i4 - i20;                                                \
+    const T_VEC w9 = i12 + i28;                                               \
+    const T_VEC w10 = i12 - i28;                                              \
+    const T_VEC w11 = w7 + w9;                                                \
+    const T_VEC w12 = w7 - w9;                                                \
+    const T_VEC w14 = w4 + w11;                                               \
+    const T_VEC w15 = w4 - w11;                                               \
+    const T_VEC w16[2] = { (T_VEC)(w1 + (kWeight2 * w8 - kWeight2 * w10)),    \
+                           (T_VEC)(-w3 - (kWeight2 * w10 + kWeight2 * w8)) }; \
+    const T_VEC w18[2] = { (T_VEC)(w1 + (-kWeight2 * w8 + kWeight2 * w10)),   \
+                           (T_VEC)(w3 + (-kWeight2 * w10 - kWeight2 * w8)) }; \
+    const T_VEC w19 = i2 + i18;                                               \
+    const T_VEC w20 = i2 - i18;                                               \
+    const T_VEC w21 = i10 + i26;                                              \
+    const T_VEC w22 = i10 - i26;                                              \
+    const T_VEC w23 = w19 + w21;                                              \
+    const T_VEC w24 = w19 - w21;                                              \
+    const T_VEC w26 = i6 + i22;                                               \
+    const T_VEC w27 = i6 - i22;                                               \
+    const T_VEC w28 = i14 + i30;                                              \
+    const T_VEC w29 = i14 - i30;                                              \
+    const T_VEC w30 = w26 + w28;                                              \
+    const T_VEC w31 = w26 - w28;                                              \
+    const T_VEC w33 = w23 + w30;                                              \
+    const T_VEC w34 = w23 - w30;                                              \
+    const T_VEC w35[2] = { (T_VEC)(w20 + (kWeight2 * w27 - kWeight2 * w29)),  \
+                           (T_VEC)(-w22 +                                     \
+                                   -(kWeight2 * w29 + kWeight2 * w27)) };     \
+    const T_VEC w37[2] = { (T_VEC)(w20 + (-kWeight2 * w27 + kWeight2 * w29)), \
+                           (T_VEC)(w22 +                                      \
+                                   (-kWeight2 * w29 - kWeight2 * w27)) };     \
+    const T_VEC w38 = w14 + w33;                                              \
+    const T_VEC w39 = w14 - w33;                                              \
+    const T_VEC w40[2] = {                                                    \
+      (T_VEC)(w16[0] + (kWeight3 * w35[0] + kWeight4 * w35[1])),              \
+      (T_VEC)(w16[1] + (kWeight3 * w35[1] - kWeight4 * w35[0]))               \
+    };                                                                        \
+    const T_VEC w41[2] = { (T_VEC)(w5 + (kWeight2 * w24 - kWeight2 * w31)),   \
+                           (T_VEC)(-w12 +                                     \
+                                   -(kWeight2 * w31 + kWeight2 * w24)) };     \
+    const T_VEC w42[2] = {                                                    \
+      (T_VEC)(w18[0] + (kWeight4 * w37[0] + kWeight3 * w37[1])),              \
+      (T_VEC)(w18[1] + (kWeight4 * w37[1] - kWeight3 * w37[0]))               \
+    };                                                                        \
+    const T_VEC w44[2] = {                                                    \
+      (T_VEC)(w18[0] + (-kWeight4 * w37[0] - kWeight3 * w37[1])),             \
+      (T_VEC)(-w18[1] - (-kWeight4 * w37[1] + kWeight3 * w37[0]))             \
+    };                                                                        \
+    const T_VEC w45[2] = { (T_VEC)(w5 + (-kWeight2 * w24 + kWeight2 * w31)),  \
+                           (T_VEC)(w12 +                                      \
+                                   (-kWeight2 * w31 - kWeight2 * w24)) };     \
+    const T_VEC w46[2] = {                                                    \
+      (T_VEC)(w16[0] + (-kWeight3 * w35[0] - kWeight4 * w35[1])),             \
+      (T_VEC)(-w16[1] - (-kWeight3 * w35[1] + kWeight4 * w35[0]))             \
+    };                                                                        \
+    const T_VEC w47 = i1 + i17;                                               \
+    const T_VEC w48 = i1 - i17;                                               \
+    const T_VEC w49 = i9 + i25;                                               \
+    const T_VEC w50 = i9 - i25;                                               \
+    const T_VEC w51 = w47 + w49;                                              \
+    const T_VEC w52 = w47 - w49;                                              \
+    const T_VEC w54 = i5 + i21;                                               \
+    const T_VEC w55 = i5 - i21;                                               \
+    const T_VEC w56 = i13 + i29;                                              \
+    const T_VEC w57 = i13 - i29;                                              \
+    const T_VEC w58 = w54 + w56;                                              \
+    const T_VEC w59 = w54 - w56;                                              \
+    const T_VEC w61 = w51 + w58;                                              \
+    const T_VEC w62 = w51 - w58;                                              \
+    const T_VEC w63[2] = { (T_VEC)(w48 + (kWeight2 * w55 - kWeight2 * w57)),  \
+                           (T_VEC)(-w50 +                                     \
+                                   -(kWeight2 * w57 + kWeight2 * w55)) };     \
+    const T_VEC w65[2] = { (T_VEC)(w48 + (-kWeight2 * w55 + kWeight2 * w57)), \
+                           (T_VEC)(w50 +                                      \
+                                   (-kWeight2 * w57 - kWeight2 * w55)) };     \
+    const T_VEC w66 = i3 + i19;                                               \
+    const T_VEC w67 = i3 - i19;                                               \
+    const T_VEC w68 = i11 + i27;                                              \
+    const T_VEC w69 = i11 - i27;                                              \
+    const T_VEC w70 = w66 + w68;                                              \
+    const T_VEC w71 = w66 - w68;                                              \
+    const T_VEC w73 = i7 + i23;                                               \
+    const T_VEC w74 = i7 - i23;                                               \
+    const T_VEC w75 = i15 + i31;                                              \
+    const T_VEC w76 = i15 - i31;                                              \
+    const T_VEC w77 = w73 + w75;                                              \
+    const T_VEC w78 = w73 - w75;                                              \
+    const T_VEC w80 = w70 + w77;                                              \
+    const T_VEC w81 = w70 - w77;                                              \
+    const T_VEC w82[2] = { (T_VEC)(w67 + (kWeight2 * w74 - kWeight2 * w76)),  \
+                           (T_VEC)(-w69 +                                     \
+                                   -(kWeight2 * w76 + kWeight2 * w74)) };     \
+    const T_VEC w84[2] = { (T_VEC)(w67 + (-kWeight2 * w74 + kWeight2 * w76)), \
+                           (T_VEC)(w69 +                                      \
+                                   (-kWeight2 * w76 - kWeight2 * w74)) };     \
+    const T_VEC w85 = w61 + w80;                                              \
+    const T_VEC w86 = w61 - w80;                                              \
+    const T_VEC w87[2] = {                                                    \
+      (T_VEC)(w63[0] + (kWeight3 * w82[0] + kWeight4 * w82[1])),              \
+      (T_VEC)(w63[1] + (kWeight3 * w82[1] - kWeight4 * w82[0]))               \
+    };                                                                        \
+    const T_VEC w88[2] = { (T_VEC)(w52 + (kWeight2 * w71 - kWeight2 * w78)),  \
+                           (T_VEC)(-w59 +                                     \
+                                   -(kWeight2 * w78 + kWeight2 * w71)) };     \
+    const T_VEC w89[2] = {                                                    \
+      (T_VEC)(w65[0] + (kWeight4 * w84[0] + kWeight3 * w84[1])),              \
+      (T_VEC)(w65[1] + (kWeight4 * w84[1] - kWeight3 * w84[0]))               \
+    };                                                                        \
+    const T_VEC w91[2] = {                                                    \
+      (T_VEC)(w65[0] + (-kWeight4 * w84[0] - kWeight3 * w84[1])),             \
+      (T_VEC)(-w65[1] - (-kWeight4 * w84[1] + kWeight3 * w84[0]))             \
+    };                                                                        \
+    const T_VEC w92[2] = { (T_VEC)(w52 + (-kWeight2 * w71 + kWeight2 * w78)), \
+                           (T_VEC)(w59 +                                      \
+                                   (-kWeight2 * w78 - kWeight2 * w71)) };     \
+    const T_VEC w93[2] = {                                                    \
+      (T_VEC)(w63[0] + (-kWeight3 * w82[0] - kWeight4 * w82[1])),             \
+      (T_VEC)(-w63[1] - (-kWeight3 * w82[1] + kWeight4 * w82[0]))             \
+    };                                                                        \
+    store(output + 0 * stride, w38 + w85);                                    \
+    store(output + 1 * stride,                                                \
+          w40[0] + (kWeight5 * w87[0] + kWeight6 * w87[1]));                  \
+    store(output + 2 * stride,                                                \
+          w41[0] + (kWeight3 * w88[0] + kWeight4 * w88[1]));                  \
+    store(output + 3 * stride,                                                \
+          w42[0] + (kWeight7 * w89[0] + kWeight8 * w89[1]));                  \
+    store(output + 4 * stride, w15 + (kWeight2 * w62 - kWeight2 * w81));      \
+    store(output + 5 * stride,                                                \
+          w44[0] + (kWeight8 * w91[0] + kWeight7 * w91[1]));                  \
+    store(output + 6 * stride,                                                \
+          w45[0] + (kWeight4 * w92[0] + kWeight3 * w92[1]));                  \
+    store(output + 7 * stride,                                                \
+          w46[0] + (kWeight6 * w93[0] + kWeight5 * w93[1]));                  \
+    store(output + 8 * stride, w39);                                          \
+    store(output + 9 * stride,                                                \
+          w46[0] + (-kWeight6 * w93[0] - kWeight5 * w93[1]));                 \
+    store(output + 10 * stride,                                               \
+          w45[0] + (-kWeight4 * w92[0] - kWeight3 * w92[1]));                 \
+    store(output + 11 * stride,                                               \
+          w44[0] + (-kWeight8 * w91[0] - kWeight7 * w91[1]));                 \
+    store(output + 12 * stride, w15 + (-kWeight2 * w62 + kWeight2 * w81));    \
+    store(output + 13 * stride,                                               \
+          w42[0] + (-kWeight7 * w89[0] - kWeight8 * w89[1]));                 \
+    store(output + 14 * stride,                                               \
+          w41[0] + (-kWeight3 * w88[0] - kWeight4 * w88[1]));                 \
+    store(output + 15 * stride,                                               \
+          w40[0] + (-kWeight5 * w87[0] - kWeight6 * w87[1]));                 \
+    store(output + 16 * stride, w38 - w85);                                   \
+    store(output + 17 * stride,                                               \
+          w40[1] + (kWeight5 * w87[1] - kWeight6 * w87[0]));                  \
+    store(output + 18 * stride,                                               \
+          w41[1] + (kWeight3 * w88[1] - kWeight4 * w88[0]));                  \
+    store(output + 19 * stride,                                               \
+          w42[1] + (kWeight7 * w89[1] - kWeight8 * w89[0]));                  \
+    store(output + 20 * stride, -w34 - (kWeight2 * w81 + kWeight2 * w62));    \
+    store(output + 21 * stride,                                               \
+          w44[1] + (kWeight8 * w91[1] - kWeight7 * w91[0]));                  \
+    store(output + 22 * stride,                                               \
+          w45[1] + (kWeight4 * w92[1] - kWeight3 * w92[0]));                  \
+    store(output + 23 * stride,                                               \
+          w46[1] + (kWeight6 * w93[1] - kWeight5 * w93[0]));                  \
+    store(output + 24 * stride, -w86);                                        \
+    store(output + 25 * stride,                                               \
+          -w46[1] - (-kWeight6 * w93[1] + kWeight5 * w93[0]));                \
+    store(output + 26 * stride,                                               \
+          -w45[1] - (-kWeight4 * w92[1] + kWeight3 * w92[0]));                \
+    store(output + 27 * stride,                                               \
+          -w44[1] - (-kWeight8 * w91[1] + kWeight7 * w91[0]));                \
+    store(output + 28 * stride, w34 + (-kWeight2 * w81 - kWeight2 * w62));    \
+    store(output + 29 * stride,                                               \
+          -w42[1] - (-kWeight7 * w89[1] + kWeight8 * w89[0]));                \
+    store(output + 30 * stride,                                               \
+          -w41[1] - (-kWeight3 * w88[1] + kWeight4 * w88[0]));                \
+    store(output + 31 * stride,                                               \
+          -w40[1] - (-kWeight5 * w87[1] + kWeight6 * w87[0]));                \
+  }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                       \
+    const T_VEC i1 = load(input + 1 * stride);                       \
+    store(output + 0 * stride, i0 + i1);                             \
+    store(output + 1 * stride, i0 - i1);                             \
+  }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                       \
+    const T_VEC i1 = load(input + 1 * stride);                       \
+    const T_VEC i2 = load(input + 2 * stride);                       \
+    const T_VEC i3 = load(input + 3 * stride);                       \
+    const T_VEC w2 = i0 + i2;                                        \
+    const T_VEC w3 = i0 - i2;                                        \
+    const T_VEC w4[2] = { (T_VEC)(i1 + i1), (T_VEC)(-i3 + i3) };     \
+    const T_VEC w5[2] = { (T_VEC)(i1 - i1), (T_VEC)(-i3 - i3) };     \
+    store(output + 0 * stride, w2 + w4[0]);                          \
+    store(output + 1 * stride, w3 + w5[1]);                          \
+    store(output + 2 * stride, w2 - w4[0]);                          \
+    store(output + 3 * stride, w3 - w5[1]);                          \
+  }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant)     \
+  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC kWeight2 = constant(0.707107f);                      \
+    const T_VEC i0 = load(input + 0 * stride);                       \
+    const T_VEC i1 = load(input + 1 * stride);                       \
+    const T_VEC i2 = load(input + 2 * stride);                       \
+    const T_VEC i3 = load(input + 3 * stride);                       \
+    const T_VEC i4 = load(input + 4 * stride);                       \
+    const T_VEC i5 = load(input + 5 * stride);                       \
+    const T_VEC i6 = load(input + 6 * stride);                       \
+    const T_VEC i7 = load(input + 7 * stride);                       \
+    const T_VEC w6 = i0 + i4;                                        \
+    const T_VEC w7 = i0 - i4;                                        \
+    const T_VEC w8[2] = { (T_VEC)(i2 + i2), (T_VEC)(-i6 + i6) };     \
+    const T_VEC w9[2] = { (T_VEC)(i2 - i2), (T_VEC)(-i6 - i6) };     \
+    const T_VEC w10[2] = { (T_VEC)(w6 + w8[0]), (T_VEC)(w8[1]) };    \
+    const T_VEC w11[2] = { (T_VEC)(w6 - w8[0]), (T_VEC)(-w8[1]) };   \
+    const T_VEC w12[2] = { (T_VEC)(w7 + w9[1]), (T_VEC)(-w9[0]) };   \
+    const T_VEC w13[2] = { (T_VEC)(w7 - w9[1]), (T_VEC)(w9[0]) };    \
+    const T_VEC w14[2] = { (T_VEC)(i1 + i3), (T_VEC)(-i5 + i7) };    \
+    const T_VEC w15[2] = { (T_VEC)(i1 - i3), (T_VEC)(-i5 - i7) };    \
+    const T_VEC w16[2] = { (T_VEC)(i3 + i1), (T_VEC)(-i7 + i5) };    \
+    const T_VEC w17[2] = { (T_VEC)(i3 - i1), (T_VEC)(-i7 - i5) };    \
+    const T_VEC w18[2] = { (T_VEC)(w14[0] + w16[0]),                 \
+                           (T_VEC)(w14[1] + w16[1]) };               \
+    const T_VEC w19[2] = { (T_VEC)(w14[0] - w16[0]),                 \
+                           (T_VEC)(w14[1] - w16[1]) };               \
+    const T_VEC w20[2] = { (T_VEC)(w15[0] + w17[1]),                 \
+                           (T_VEC)(w15[1] - w17[0]) };               \
+    const T_VEC w21[2] = { (T_VEC)(w15[0] - w17[1]),                 \
+                           (T_VEC)(w15[1] + w17[0]) };               \
+    store(output + 0 * stride, w10[0] + w18[0]);                     \
+    store(output + 1 * stride,                                       \
+          w12[0] + (kWeight2 * w20[0] + kWeight2 * w20[1]));         \
+    store(output + 2 * stride, w11[0] + w19[1]);                     \
+    store(output + 3 * stride,                                       \
+          w13[0] + (-kWeight2 * w21[0] + kWeight2 * w21[1]));        \
+    store(output + 4 * stride, w10[0] - w18[0]);                     \
+    store(output + 5 * stride,                                       \
+          w12[0] + (-kWeight2 * w20[0] - kWeight2 * w20[1]));        \
+    store(output + 6 * stride, w11[0] - w19[1]);                     \
+    store(output + 7 * stride,                                       \
+          w13[0] + (kWeight2 * w21[0] - kWeight2 * w21[1]));         \
+  }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant)     \
+  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC kWeight2 = constant(0.707107f);                       \
+    const T_VEC kWeight3 = constant(0.92388f);                        \
+    const T_VEC kWeight4 = constant(0.382683f);                       \
+    const T_VEC i0 = load(input + 0 * stride);                        \
+    const T_VEC i1 = load(input + 1 * stride);                        \
+    const T_VEC i2 = load(input + 2 * stride);                        \
+    const T_VEC i3 = load(input + 3 * stride);                        \
+    const T_VEC i4 = load(input + 4 * stride);                        \
+    const T_VEC i5 = load(input + 5 * stride);                        \
+    const T_VEC i6 = load(input + 6 * stride);                        \
+    const T_VEC i7 = load(input + 7 * stride);                        \
+    const T_VEC i8 = load(input + 8 * stride);                        \
+    const T_VEC i9 = load(input + 9 * stride);                        \
+    const T_VEC i10 = load(input + 10 * stride);                      \
+    const T_VEC i11 = load(input + 11 * stride);                      \
+    const T_VEC i12 = load(input + 12 * stride);                      \
+    const T_VEC i13 = load(input + 13 * stride);                      \
+    const T_VEC i14 = load(input + 14 * stride);                      \
+    const T_VEC i15 = load(input + 15 * stride);                      \
+    const T_VEC w14 = i0 + i8;                                        \
+    const T_VEC w15 = i0 - i8;                                        \
+    const T_VEC w16[2] = { (T_VEC)(i4 + i4), (T_VEC)(-i12 + i12) };   \
+    const T_VEC w17[2] = { (T_VEC)(i4 - i4), (T_VEC)(-i12 - i12) };   \
+    const T_VEC w18[2] = { (T_VEC)(w14 + w16[0]), (T_VEC)(w16[1]) };  \
+    const T_VEC w19[2] = { (T_VEC)(w14 - w16[0]), (T_VEC)(-w16[1]) }; \
+    const T_VEC w20[2] = { (T_VEC)(w15 + w17[1]), (T_VEC)(-w17[0]) }; \
+    const T_VEC w21[2] = { (T_VEC)(w15 - w17[1]), (T_VEC)(w17[0]) };  \
+    const T_VEC w22[2] = { (T_VEC)(i2 + i6), (T_VEC)(-i10 + i14) };   \
+    const T_VEC w23[2] = { (T_VEC)(i2 - i6), (T_VEC)(-i10 - i14) };   \
+    const T_VEC w24[2] = { (T_VEC)(i6 + i2), (T_VEC)(-i14 + i10) };   \
+    const T_VEC w25[2] = { (T_VEC)(i6 - i2), (T_VEC)(-i14 - i10) };   \
+    const T_VEC w26[2] = { (T_VEC)(w22[0] + w24[0]),                  \
+                           (T_VEC)(w22[1] + w24[1]) };                \
+    const T_VEC w27[2] = { (T_VEC)(w22[0] - w24[0]),                  \
+                           (T_VEC)(w22[1] - w24[1]) };                \
+    const T_VEC w28[2] = { (T_VEC)(w23[0] + w25[1]),                  \
+                           (T_VEC)(w23[1] - w25[0]) };                \
+    const T_VEC w29[2] = { (T_VEC)(w23[0] - w25[1]),                  \
+                           (T_VEC)(w23[1] + w25[0]) };                \
+    const T_VEC w30[2] = { (T_VEC)(w18[0] + w26[0]),                  \
+                           (T_VEC)(w18[1] + w26[1]) };                \
+    const T_VEC w31[2] = { (T_VEC)(w18[0] - w26[0]),                  \
+                           (T_VEC)(w18[1] - w26[1]) };                \
+    const T_VEC w32[2] = {                                            \
+      (T_VEC)(w20[0] + (kWeight2 * w28[0] + kWeight2 * w28[1])),      \
+      (T_VEC)(w20[1] + (kWeight2 * w28[1] - kWeight2 * w28[0]))       \
+    };                                                                \
+    const T_VEC w33[2] = {                                            \
+      (T_VEC)(w20[0] + (-kWeight2 * w28[0] - kWeight2 * w28[1])),     \
+      (T_VEC)(w20[1] + (-kWeight2 * w28[1] + kWeight2 * w28[0]))      \
+    };                                                                \
+    const T_VEC w34[2] = { (T_VEC)(w19[0] + w27[1]),                  \
+                           (T_VEC)(w19[1] - w27[0]) };                \
+    const T_VEC w35[2] = { (T_VEC)(w19[0] - w27[1]),                  \
+                           (T_VEC)(w19[1] + w27[0]) };                \
+    const T_VEC w36[2] = {                                            \
+      (T_VEC)(w21[0] + (-kWeight2 * w29[0] + kWeight2 * w29[1])),     \
+      (T_VEC)(w21[1] + (-kWeight2 * w29[1] - kWeight2 * w29[0]))      \
+    };                                                                \
+    const T_VEC w37[2] = {                                            \
+      (T_VEC)(w21[0] + (kWeight2 * w29[0] - kWeight2 * w29[1])),      \
+      (T_VEC)(w21[1] + (kWeight2 * w29[1] + kWeight2 * w29[0]))       \
+    };                                                                \
+    const T_VEC w38[2] = { (T_VEC)(i1 + i7), (T_VEC)(-i9 + i15) };    \
+    const T_VEC w39[2] = { (T_VEC)(i1 - i7), (T_VEC)(-i9 - i15) };    \
+    const T_VEC w40[2] = { (T_VEC)(i5 + i3), (T_VEC)(-i13 + i11) };   \
+    const T_VEC w41[2] = { (T_VEC)(i5 - i3), (T_VEC)(-i13 - i11) };   \
+    const T_VEC w42[2] = { (T_VEC)(w38[0] + w40[0]),                  \
+                           (T_VEC)(w38[1] + w40[1]) };                \
+    const T_VEC w43[2] = { (T_VEC)(w38[0] - w40[0]),                  \
+                           (T_VEC)(w38[1] - w40[1]) };                \
+    const T_VEC w44[2] = { (T_VEC)(w39[0] + w41[1]),                  \
+                           (T_VEC)(w39[1] - w41[0]) };                \
+    const T_VEC w45[2] = { (T_VEC)(w39[0] - w41[1]),                  \
+                           (T_VEC)(w39[1] + w41[0]) };                \
+    const T_VEC w46[2] = { (T_VEC)(i3 + i5), (T_VEC)(-i11 + i13) };   \
+    const T_VEC w47[2] = { (T_VEC)(i3 - i5), (T_VEC)(-i11 - i13) };   \
+    const T_VEC w48[2] = { (T_VEC)(i7 + i1), (T_VEC)(-i15 + i9) };    \
+    const T_VEC w49[2] = { (T_VEC)(i7 - i1), (T_VEC)(-i15 - i9) };    \
+    const T_VEC w50[2] = { (T_VEC)(w46[0] + w48[0]),                  \
+                           (T_VEC)(w46[1] + w48[1]) };                \
+    const T_VEC w51[2] = { (T_VEC)(w46[0] - w48[0]),                  \
+                           (T_VEC)(w46[1] - w48[1]) };                \
+    const T_VEC w52[2] = { (T_VEC)(w47[0] + w49[1]),                  \
+                           (T_VEC)(w47[1] - w49[0]) };                \
+    const T_VEC w53[2] = { (T_VEC)(w47[0] - w49[1]),                  \
+                           (T_VEC)(w47[1] + w49[0]) };                \
+    const T_VEC w54[2] = { (T_VEC)(w42[0] + w50[0]),                  \
+                           (T_VEC)(w42[1] + w50[1]) };                \
+    const T_VEC w55[2] = { (T_VEC)(w42[0] - w50[0]),                  \
+                           (T_VEC)(w42[1] - w50[1]) };                \
+    const T_VEC w56[2] = {                                            \
+      (T_VEC)(w44[0] + (kWeight2 * w52[0] + kWeight2 * w52[1])),      \
+      (T_VEC)(w44[1] + (kWeight2 * w52[1] - kWeight2 * w52[0]))       \
+    };                                                                \
+    const T_VEC w57[2] = {                                            \
+      (T_VEC)(w44[0] + (-kWeight2 * w52[0] - kWeight2 * w52[1])),     \
+      (T_VEC)(w44[1] + (-kWeight2 * w52[1] + kWeight2 * w52[0]))      \
+    };                                                                \
+    const T_VEC w58[2] = { (T_VEC)(w43[0] + w51[1]),                  \
+                           (T_VEC)(w43[1] - w51[0]) };                \
+    const T_VEC w59[2] = { (T_VEC)(w43[0] - w51[1]),                  \
+                           (T_VEC)(w43[1] + w51[0]) };                \
+    const T_VEC w60[2] = {                                            \
+      (T_VEC)(w45[0] + (-kWeight2 * w53[0] + kWeight2 * w53[1])),     \
+      (T_VEC)(w45[1] + (-kWeight2 * w53[1] - kWeight2 * w53[0]))      \
+    };                                                                \
+    const T_VEC w61[2] = {                                            \
+      (T_VEC)(w45[0] + (kWeight2 * w53[0] - kWeight2 * w53[1])),      \
+      (T_VEC)(w45[1] + (kWeight2 * w53[1] + kWeight2 * w53[0]))       \
+    };                                                                \
+    store(output + 0 * stride, w30[0] + w54[0]);                      \
+    store(output + 1 * stride,                                        \
+          w32[0] + (kWeight3 * w56[0] + kWeight4 * w56[1]));          \
+    store(output + 2 * stride,                                        \
+          w34[0] + (kWeight2 * w58[0] + kWeight2 * w58[1]));          \
+    store(output + 3 * stride,                                        \
+          w36[0] + (kWeight4 * w60[0] + kWeight3 * w60[1]));          \
+    store(output + 4 * stride, w31[0] + w55[1]);                      \
+    store(output + 5 * stride,                                        \
+          w33[0] + (-kWeight4 * w57[0] + kWeight3 * w57[1]));         \
+    store(output + 6 * stride,                                        \
+          w35[0] + (-kWeight2 * w59[0] + kWeight2 * w59[1]));         \
+    store(output + 7 * stride,                                        \
+          w37[0] + (-kWeight3 * w61[0] + kWeight4 * w61[1]));         \
+    store(output + 8 * stride, w30[0] - w54[0]);                      \
+    store(output + 9 * stride,                                        \
+          w32[0] + (-kWeight3 * w56[0] - kWeight4 * w56[1]));         \
+    store(output + 10 * stride,                                       \
+          w34[0] + (-kWeight2 * w58[0] - kWeight2 * w58[1]));         \
+    store(output + 11 * stride,                                       \
+          w36[0] + (-kWeight4 * w60[0] - kWeight3 * w60[1]));         \
+    store(output + 12 * stride, w31[0] - w55[1]);                     \
+    store(output + 13 * stride,                                       \
+          w33[0] + (kWeight4 * w57[0] - kWeight3 * w57[1]));          \
+    store(output + 14 * stride,                                       \
+          w35[0] + (kWeight2 * w59[0] - kWeight2 * w59[1]));          \
+    store(output + 15 * stride,                                       \
+          w37[0] + (kWeight3 * w61[0] - kWeight4 * w61[1]));          \
+  }
+
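+// Expands into a 1d 32-point inverse transform. The kWeight constants are the
+// cosine/sine twiddle factors of the butterflies (kWeight2 = cos(pi/4),
+// kWeight3/kWeight4 = cos/sin(pi/8), kWeight5..kWeight8 = cos/sin(pi/16) and
+// cos/sin(3*pi/16)).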
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant)     \
+  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC kWeight2 = constant(0.707107f);                       \
+    const T_VEC kWeight3 = constant(0.92388f);                        \
+    const T_VEC kWeight4 = constant(0.382683f);                       \
+    const T_VEC kWeight5 = constant(0.980785f);                       \
+    const T_VEC kWeight6 = constant(0.19509f);                        \
+    const T_VEC kWeight7 = constant(0.83147f);                        \
+    const T_VEC kWeight8 = constant(0.55557f);                        \
+    const T_VEC i0 = load(input + 0 * stride);                        \
+    const T_VEC i1 = load(input + 1 * stride);                        \
+    const T_VEC i2 = load(input + 2 * stride);                        \
+    const T_VEC i3 = load(input + 3 * stride);                        \
+    const T_VEC i4 = load(input + 4 * stride);                        \
+    const T_VEC i5 = load(input + 5 * stride);                        \
+    const T_VEC i6 = load(input + 6 * stride);                        \
+    const T_VEC i7 = load(input + 7 * stride);                        \
+    const T_VEC i8 = load(input + 8 * stride);                        \
+    const T_VEC i9 = load(input + 9 * stride);                        \
+    const T_VEC i10 = load(input + 10 * stride);                      \
+    const T_VEC i11 = load(input + 11 * stride);                      \
+    const T_VEC i12 = load(input + 12 * stride);                      \
+    const T_VEC i13 = load(input + 13 * stride);                      \
+    const T_VEC i14 = load(input + 14 * stride);                      \
+    const T_VEC i15 = load(input + 15 * stride);                      \
+    const T_VEC i16 = load(input + 16 * stride);                      \
+    const T_VEC i17 = load(input + 17 * stride);                      \
+    const T_VEC i18 = load(input + 18 * stride);                      \
+    const T_VEC i19 = load(input + 19 * stride);                      \
+    const T_VEC i20 = load(input + 20 * stride);                      \
+    const T_VEC i21 = load(input + 21 * stride);                      \
+    const T_VEC i22 = load(input + 22 * stride);                      \
+    const T_VEC i23 = load(input + 23 * stride);                      \
+    const T_VEC i24 = load(input + 24 * stride);                      \
+    const T_VEC i25 = load(input + 25 * stride);                      \
+    const T_VEC i26 = load(input + 26 * stride);                      \
+    const T_VEC i27 = load(input + 27 * stride);                      \
+    const T_VEC i28 = load(input + 28 * stride);                      \
+    const T_VEC i29 = load(input + 29 * stride);                      \
+    const T_VEC i30 = load(input + 30 * stride);                      \
+    const T_VEC i31 = load(input + 31 * stride);                      \
+    const T_VEC w30 = i0 + i16;                                       \
+    const T_VEC w31 = i0 - i16;                                       \
+    const T_VEC w32[2] = { (T_VEC)(i8 + i8), (T_VEC)(-i24 + i24) };   \
+    const T_VEC w33[2] = { (T_VEC)(i8 - i8), (T_VEC)(-i24 - i24) };   \
+    const T_VEC w34[2] = { (T_VEC)(w30 + w32[0]), (T_VEC)(w32[1]) };  \
+    const T_VEC w35[2] = { (T_VEC)(w30 - w32[0]), (T_VEC)(-w32[1]) }; \
+    const T_VEC w36[2] = { (T_VEC)(w31 + w33[1]), (T_VEC)(-w33[0]) }; \
+    const T_VEC w37[2] = { (T_VEC)(w31 - w33[1]), (T_VEC)(w33[0]) };  \
+    const T_VEC w38[2] = { (T_VEC)(i4 + i12), (T_VEC)(-i20 + i28) };  \
+    const T_VEC w39[2] = { (T_VEC)(i4 - i12), (T_VEC)(-i20 - i28) };  \
+    const T_VEC w40[2] = { (T_VEC)(i12 + i4), (T_VEC)(-i28 + i20) };  \
+    const T_VEC w41[2] = { (T_VEC)(i12 - i4), (T_VEC)(-i28 - i20) };  \
+    const T_VEC w42[2] = { (T_VEC)(w38[0] + w40[0]),                  \
+                           (T_VEC)(w38[1] + w40[1]) };                \
+    const T_VEC w43[2] = { (T_VEC)(w38[0] - w40[0]),                  \
+                           (T_VEC)(w38[1] - w40[1]) };                \
+    const T_VEC w44[2] = { (T_VEC)(w39[0] + w41[1]),                  \
+                           (T_VEC)(w39[1] - w41[0]) };                \
+    const T_VEC w45[2] = { (T_VEC)(w39[0] - w41[1]),                  \
+                           (T_VEC)(w39[1] + w41[0]) };                \
+    const T_VEC w46[2] = { (T_VEC)(w34[0] + w42[0]),                  \
+                           (T_VEC)(w34[1] + w42[1]) };                \
+    const T_VEC w47[2] = { (T_VEC)(w34[0] - w42[0]),                  \
+                           (T_VEC)(w34[1] - w42[1]) };                \
+    const T_VEC w48[2] = {                                            \
+      (T_VEC)(w36[0] + (kWeight2 * w44[0] + kWeight2 * w44[1])),      \
+      (T_VEC)(w36[1] + (kWeight2 * w44[1] - kWeight2 * w44[0]))       \
+    };                                                                \
+    const T_VEC w49[2] = {                                            \
+      (T_VEC)(w36[0] + (-kWeight2 * w44[0] - kWeight2 * w44[1])),     \
+      (T_VEC)(w36[1] + (-kWeight2 * w44[1] + kWeight2 * w44[0]))      \
+    };                                                                \
+    const T_VEC w50[2] = { (T_VEC)(w35[0] + w43[1]),                  \
+                           (T_VEC)(w35[1] - w43[0]) };                \
+    const T_VEC w51[2] = { (T_VEC)(w35[0] - w43[1]),                  \
+                           (T_VEC)(w35[1] + w43[0]) };                \
+    const T_VEC w52[2] = {                                            \
+      (T_VEC)(w37[0] + (-kWeight2 * w45[0] + kWeight2 * w45[1])),     \
+      (T_VEC)(w37[1] + (-kWeight2 * w45[1] - kWeight2 * w45[0]))      \
+    };                                                                \
+    const T_VEC w53[2] = {                                            \
+      (T_VEC)(w37[0] + (kWeight2 * w45[0] - kWeight2 * w45[1])),      \
+      (T_VEC)(w37[1] + (kWeight2 * w45[1] + kWeight2 * w45[0]))       \
+    };                                                                \
+    const T_VEC w54[2] = { (T_VEC)(i2 + i14), (T_VEC)(-i18 + i30) };  \
+    const T_VEC w55[2] = { (T_VEC)(i2 - i14), (T_VEC)(-i18 - i30) };  \
+    const T_VEC w56[2] = { (T_VEC)(i10 + i6), (T_VEC)(-i26 + i22) };  \
+    const T_VEC w57[2] = { (T_VEC)(i10 - i6), (T_VEC)(-i26 - i22) };  \
+    const T_VEC w58[2] = { (T_VEC)(w54[0] + w56[0]),                  \
+                           (T_VEC)(w54[1] + w56[1]) };                \
+    const T_VEC w59[2] = { (T_VEC)(w54[0] - w56[0]),                  \
+                           (T_VEC)(w54[1] - w56[1]) };                \
+    const T_VEC w60[2] = { (T_VEC)(w55[0] + w57[1]),                  \
+                           (T_VEC)(w55[1] - w57[0]) };                \
+    const T_VEC w61[2] = { (T_VEC)(w55[0] - w57[1]),                  \
+                           (T_VEC)(w55[1] + w57[0]) };                \
+    const T_VEC w62[2] = { (T_VEC)(i6 + i10), (T_VEC)(-i22 + i26) };  \
+    const T_VEC w63[2] = { (T_VEC)(i6 - i10), (T_VEC)(-i22 - i26) };  \
+    const T_VEC w64[2] = { (T_VEC)(i14 + i2), (T_VEC)(-i30 + i18) };  \
+    const T_VEC w65[2] = { (T_VEC)(i14 - i2), (T_VEC)(-i30 - i18) };  \
+    const T_VEC w66[2] = { (T_VEC)(w62[0] + w64[0]),                  \
+                           (T_VEC)(w62[1] + w64[1]) };                \
+    const T_VEC w67[2] = { (T_VEC)(w62[0] - w64[0]),                  \
+                           (T_VEC)(w62[1] - w64[1]) };                \
+    const T_VEC w68[2] = { (T_VEC)(w63[0] + w65[1]),                  \
+                           (T_VEC)(w63[1] - w65[0]) };                \
+    const T_VEC w69[2] = { (T_VEC)(w63[0] - w65[1]),                  \
+                           (T_VEC)(w63[1] + w65[0]) };                \
+    const T_VEC w70[2] = { (T_VEC)(w58[0] + w66[0]),                  \
+                           (T_VEC)(w58[1] + w66[1]) };                \
+    const T_VEC w71[2] = { (T_VEC)(w58[0] - w66[0]),                  \
+                           (T_VEC)(w58[1] - w66[1]) };                \
+    const T_VEC w72[2] = {                                            \
+      (T_VEC)(w60[0] + (kWeight2 * w68[0] + kWeight2 * w68[1])),      \
+      (T_VEC)(w60[1] + (kWeight2 * w68[1] - kWeight2 * w68[0]))       \
+    };                                                                \
+    const T_VEC w73[2] = {                                            \
+      (T_VEC)(w60[0] + (-kWeight2 * w68[0] - kWeight2 * w68[1])),     \
+      (T_VEC)(w60[1] + (-kWeight2 * w68[1] + kWeight2 * w68[0]))      \
+    };                                                                \
+    const T_VEC w74[2] = { (T_VEC)(w59[0] + w67[1]),                  \
+                           (T_VEC)(w59[1] - w67[0]) };                \
+    const T_VEC w75[2] = { (T_VEC)(w59[0] - w67[1]),                  \
+                           (T_VEC)(w59[1] + w67[0]) };                \
+    const T_VEC w76[2] = {                                            \
+      (T_VEC)(w61[0] + (-kWeight2 * w69[0] + kWeight2 * w69[1])),     \
+      (T_VEC)(w61[1] + (-kWeight2 * w69[1] - kWeight2 * w69[0]))      \
+    };                                                                \
+    const T_VEC w77[2] = {                                            \
+      (T_VEC)(w61[0] + (kWeight2 * w69[0] - kWeight2 * w69[1])),      \
+      (T_VEC)(w61[1] + (kWeight2 * w69[1] + kWeight2 * w69[0]))       \
+    };                                                                \
+    const T_VEC w78[2] = { (T_VEC)(w46[0] + w70[0]),                  \
+                           (T_VEC)(w46[1] + w70[1]) };                \
+    const T_VEC w79[2] = { (T_VEC)(w46[0] - w70[0]),                  \
+                           (T_VEC)(w46[1] - w70[1]) };                \
+    const T_VEC w80[2] = {                                            \
+      (T_VEC)(w48[0] + (kWeight3 * w72[0] + kWeight4 * w72[1])),      \
+      (T_VEC)(w48[1] + (kWeight3 * w72[1] - kWeight4 * w72[0]))       \
+    };                                                                \
+    const T_VEC w81[2] = {                                            \
+      (T_VEC)(w48[0] + (-kWeight3 * w72[0] - kWeight4 * w72[1])),     \
+      (T_VEC)(w48[1] + (-kWeight3 * w72[1] + kWeight4 * w72[0]))      \
+    };                                                                \
+    const T_VEC w82[2] = {                                            \
+      (T_VEC)(w50[0] + (kWeight2 * w74[0] + kWeight2 * w74[1])),      \
+      (T_VEC)(w50[1] + (kWeight2 * w74[1] - kWeight2 * w74[0]))       \
+    };                                                                \
+    const T_VEC w83[2] = {                                            \
+      (T_VEC)(w50[0] + (-kWeight2 * w74[0] - kWeight2 * w74[1])),     \
+      (T_VEC)(w50[1] + (-kWeight2 * w74[1] + kWeight2 * w74[0]))      \
+    };                                                                \
+    const T_VEC w84[2] = {                                            \
+      (T_VEC)(w52[0] + (kWeight4 * w76[0] + kWeight3 * w76[1])),      \
+      (T_VEC)(w52[1] + (kWeight4 * w76[1] - kWeight3 * w76[0]))       \
+    };                                                                \
+    const T_VEC w85[2] = {                                            \
+      (T_VEC)(w52[0] + (-kWeight4 * w76[0] - kWeight3 * w76[1])),     \
+      (T_VEC)(w52[1] + (-kWeight4 * w76[1] + kWeight3 * w76[0]))      \
+    };                                                                \
+    const T_VEC w86[2] = { (T_VEC)(w47[0] + w71[1]),                  \
+                           (T_VEC)(w47[1] - w71[0]) };                \
+    const T_VEC w87[2] = { (T_VEC)(w47[0] - w71[1]),                  \
+                           (T_VEC)(w47[1] + w71[0]) };                \
+    const T_VEC w88[2] = {                                            \
+      (T_VEC)(w49[0] + (-kWeight4 * w73[0] + kWeight3 * w73[1])),     \
+      (T_VEC)(w49[1] + (-kWeight4 * w73[1] - kWeight3 * w73[0]))      \
+    };                                                                \
+    const T_VEC w89[2] = {                                            \
+      (T_VEC)(w49[0] + (kWeight4 * w73[0] - kWeight3 * w73[1])),      \
+      (T_VEC)(w49[1] + (kWeight4 * w73[1] + kWeight3 * w73[0]))       \
+    };                                                                \
+    const T_VEC w90[2] = {                                            \
+      (T_VEC)(w51[0] + (-kWeight2 * w75[0] + kWeight2 * w75[1])),     \
+      (T_VEC)(w51[1] + (-kWeight2 * w75[1] - kWeight2 * w75[0]))      \
+    };                                                                \
+    const T_VEC w91[2] = {                                            \
+      (T_VEC)(w51[0] + (kWeight2 * w75[0] - kWeight2 * w75[1])),      \
+      (T_VEC)(w51[1] + (kWeight2 * w75[1] + kWeight2 * w75[0]))       \
+    };                                                                \
+    const T_VEC w92[2] = {                                            \
+      (T_VEC)(w53[0] + (-kWeight3 * w77[0] + kWeight4 * w77[1])),     \
+      (T_VEC)(w53[1] + (-kWeight3 * w77[1] - kWeight4 * w77[0]))      \
+    };                                                                \
+    const T_VEC w93[2] = {                                            \
+      (T_VEC)(w53[0] + (kWeight3 * w77[0] - kWeight4 * w77[1])),      \
+      (T_VEC)(w53[1] + (kWeight3 * w77[1] + kWeight4 * w77[0]))       \
+    };                                                                \
+    const T_VEC w94[2] = { (T_VEC)(i1 + i15), (T_VEC)(-i17 + i31) };  \
+    const T_VEC w95[2] = { (T_VEC)(i1 - i15), (T_VEC)(-i17 - i31) };  \
+    const T_VEC w96[2] = { (T_VEC)(i9 + i7), (T_VEC)(-i25 + i23) };   \
+    const T_VEC w97[2] = { (T_VEC)(i9 - i7), (T_VEC)(-i25 - i23) };   \
+    const T_VEC w98[2] = { (T_VEC)(w94[0] + w96[0]),                  \
+                           (T_VEC)(w94[1] + w96[1]) };                \
+    const T_VEC w99[2] = { (T_VEC)(w94[0] - w96[0]),                  \
+                           (T_VEC)(w94[1] - w96[1]) };                \
+    const T_VEC w100[2] = { (T_VEC)(w95[0] + w97[1]),                 \
+                            (T_VEC)(w95[1] - w97[0]) };               \
+    const T_VEC w101[2] = { (T_VEC)(w95[0] - w97[1]),                 \
+                            (T_VEC)(w95[1] + w97[0]) };               \
+    const T_VEC w102[2] = { (T_VEC)(i5 + i11), (T_VEC)(-i21 + i27) }; \
+    const T_VEC w103[2] = { (T_VEC)(i5 - i11), (T_VEC)(-i21 - i27) }; \
+    const T_VEC w104[2] = { (T_VEC)(i13 + i3), (T_VEC)(-i29 + i19) }; \
+    const T_VEC w105[2] = { (T_VEC)(i13 - i3), (T_VEC)(-i29 - i19) }; \
+    const T_VEC w106[2] = { (T_VEC)(w102[0] + w104[0]),               \
+                            (T_VEC)(w102[1] + w104[1]) };             \
+    const T_VEC w107[2] = { (T_VEC)(w102[0] - w104[0]),               \
+                            (T_VEC)(w102[1] - w104[1]) };             \
+    const T_VEC w108[2] = { (T_VEC)(w103[0] + w105[1]),               \
+                            (T_VEC)(w103[1] - w105[0]) };             \
+    const T_VEC w109[2] = { (T_VEC)(w103[0] - w105[1]),               \
+                            (T_VEC)(w103[1] + w105[0]) };             \
+    const T_VEC w110[2] = { (T_VEC)(w98[0] + w106[0]),                \
+                            (T_VEC)(w98[1] + w106[1]) };              \
+    const T_VEC w111[2] = { (T_VEC)(w98[0] - w106[0]),                \
+                            (T_VEC)(w98[1] - w106[1]) };              \
+    const T_VEC w112[2] = {                                           \
+      (T_VEC)(w100[0] + (kWeight2 * w108[0] + kWeight2 * w108[1])),   \
+      (T_VEC)(w100[1] + (kWeight2 * w108[1] - kWeight2 * w108[0]))    \
+    };                                                                \
+    const T_VEC w113[2] = {                                           \
+      (T_VEC)(w100[0] + (-kWeight2 * w108[0] - kWeight2 * w108[1])),  \
+      (T_VEC)(w100[1] + (-kWeight2 * w108[1] + kWeight2 * w108[0]))   \
+    };                                                                \
+    const T_VEC w114[2] = { (T_VEC)(w99[0] + w107[1]),                \
+                            (T_VEC)(w99[1] - w107[0]) };              \
+    const T_VEC w115[2] = { (T_VEC)(w99[0] - w107[1]),                \
+                            (T_VEC)(w99[1] + w107[0]) };              \
+    const T_VEC w116[2] = {                                           \
+      (T_VEC)(w101[0] + (-kWeight2 * w109[0] + kWeight2 * w109[1])),  \
+      (T_VEC)(w101[1] + (-kWeight2 * w109[1] - kWeight2 * w109[0]))   \
+    };                                                                \
+    const T_VEC w117[2] = {                                           \
+      (T_VEC)(w101[0] + (kWeight2 * w109[0] - kWeight2 * w109[1])),   \
+      (T_VEC)(w101[1] + (kWeight2 * w109[1] + kWeight2 * w109[0]))    \
+    };                                                                \
+    const T_VEC w118[2] = { (T_VEC)(i3 + i13), (T_VEC)(-i19 + i29) }; \
+    const T_VEC w119[2] = { (T_VEC)(i3 - i13), (T_VEC)(-i19 - i29) }; \
+    const T_VEC w120[2] = { (T_VEC)(i11 + i5), (T_VEC)(-i27 + i21) }; \
+    const T_VEC w121[2] = { (T_VEC)(i11 - i5), (T_VEC)(-i27 - i21) }; \
+    const T_VEC w122[2] = { (T_VEC)(w118[0] + w120[0]),               \
+                            (T_VEC)(w118[1] + w120[1]) };             \
+    const T_VEC w123[2] = { (T_VEC)(w118[0] - w120[0]),               \
+                            (T_VEC)(w118[1] - w120[1]) };             \
+    const T_VEC w124[2] = { (T_VEC)(w119[0] + w121[1]),               \
+                            (T_VEC)(w119[1] - w121[0]) };             \
+    const T_VEC w125[2] = { (T_VEC)(w119[0] - w121[1]),               \
+                            (T_VEC)(w119[1] + w121[0]) };             \
+    const T_VEC w126[2] = { (T_VEC)(i7 + i9), (T_VEC)(-i23 + i25) };  \
+    const T_VEC w127[2] = { (T_VEC)(i7 - i9), (T_VEC)(-i23 - i25) };  \
+    const T_VEC w128[2] = { (T_VEC)(i15 + i1), (T_VEC)(-i31 + i17) }; \
+    const T_VEC w129[2] = { (T_VEC)(i15 - i1), (T_VEC)(-i31 - i17) }; \
+    const T_VEC w130[2] = { (T_VEC)(w126[0] + w128[0]),               \
+                            (T_VEC)(w126[1] + w128[1]) };             \
+    const T_VEC w131[2] = { (T_VEC)(w126[0] - w128[0]),               \
+                            (T_VEC)(w126[1] - w128[1]) };             \
+    const T_VEC w132[2] = { (T_VEC)(w127[0] + w129[1]),               \
+                            (T_VEC)(w127[1] - w129[0]) };             \
+    const T_VEC w133[2] = { (T_VEC)(w127[0] - w129[1]),               \
+                            (T_VEC)(w127[1] + w129[0]) };             \
+    const T_VEC w134[2] = { (T_VEC)(w122[0] + w130[0]),               \
+                            (T_VEC)(w122[1] + w130[1]) };             \
+    const T_VEC w135[2] = { (T_VEC)(w122[0] - w130[0]),               \
+                            (T_VEC)(w122[1] - w130[1]) };             \
+    const T_VEC w136[2] = {                                           \
+      (T_VEC)(w124[0] + (kWeight2 * w132[0] + kWeight2 * w132[1])),   \
+      (T_VEC)(w124[1] + (kWeight2 * w132[1] - kWeight2 * w132[0]))    \
+    };                                                                \
+    const T_VEC w137[2] = {                                           \
+      (T_VEC)(w124[0] + (-kWeight2 * w132[0] - kWeight2 * w132[1])),  \
+      (T_VEC)(w124[1] + (-kWeight2 * w132[1] + kWeight2 * w132[0]))   \
+    };                                                                \
+    const T_VEC w138[2] = { (T_VEC)(w123[0] + w131[1]),               \
+                            (T_VEC)(w123[1] - w131[0]) };             \
+    const T_VEC w139[2] = { (T_VEC)(w123[0] - w131[1]),               \
+                            (T_VEC)(w123[1] + w131[0]) };             \
+    const T_VEC w140[2] = {                                           \
+      (T_VEC)(w125[0] + (-kWeight2 * w133[0] + kWeight2 * w133[1])),  \
+      (T_VEC)(w125[1] + (-kWeight2 * w133[1] - kWeight2 * w133[0]))   \
+    };                                                                \
+    const T_VEC w141[2] = {                                           \
+      (T_VEC)(w125[0] + (kWeight2 * w133[0] - kWeight2 * w133[1])),   \
+      (T_VEC)(w125[1] + (kWeight2 * w133[1] + kWeight2 * w133[0]))    \
+    };                                                                \
+    const T_VEC w142[2] = { (T_VEC)(w110[0] + w134[0]),               \
+                            (T_VEC)(w110[1] + w134[1]) };             \
+    const T_VEC w143[2] = { (T_VEC)(w110[0] - w134[0]),               \
+                            (T_VEC)(w110[1] - w134[1]) };             \
+    const T_VEC w144[2] = {                                           \
+      (T_VEC)(w112[0] + (kWeight3 * w136[0] + kWeight4 * w136[1])),   \
+      (T_VEC)(w112[1] + (kWeight3 * w136[1] - kWeight4 * w136[0]))    \
+    };                                                                \
+    const T_VEC w145[2] = {                                           \
+      (T_VEC)(w112[0] + (-kWeight3 * w136[0] - kWeight4 * w136[1])),  \
+      (T_VEC)(w112[1] + (-kWeight3 * w136[1] + kWeight4 * w136[0]))   \
+    };                                                                \
+    const T_VEC w146[2] = {                                           \
+      (T_VEC)(w114[0] + (kWeight2 * w138[0] + kWeight2 * w138[1])),   \
+      (T_VEC)(w114[1] + (kWeight2 * w138[1] - kWeight2 * w138[0]))    \
+    };                                                                \
+    const T_VEC w147[2] = {                                           \
+      (T_VEC)(w114[0] + (-kWeight2 * w138[0] - kWeight2 * w138[1])),  \
+      (T_VEC)(w114[1] + (-kWeight2 * w138[1] + kWeight2 * w138[0]))   \
+    };                                                                \
+    const T_VEC w148[2] = {                                           \
+      (T_VEC)(w116[0] + (kWeight4 * w140[0] + kWeight3 * w140[1])),   \
+      (T_VEC)(w116[1] + (kWeight4 * w140[1] - kWeight3 * w140[0]))    \
+    };                                                                \
+    const T_VEC w149[2] = {                                           \
+      (T_VEC)(w116[0] + (-kWeight4 * w140[0] - kWeight3 * w140[1])),  \
+      (T_VEC)(w116[1] + (-kWeight4 * w140[1] + kWeight3 * w140[0]))   \
+    };                                                                \
+    const T_VEC w150[2] = { (T_VEC)(w111[0] + w135[1]),               \
+                            (T_VEC)(w111[1] - w135[0]) };             \
+    const T_VEC w151[2] = { (T_VEC)(w111[0] - w135[1]),               \
+                            (T_VEC)(w111[1] + w135[0]) };             \
+    const T_VEC w152[2] = {                                           \
+      (T_VEC)(w113[0] + (-kWeight4 * w137[0] + kWeight3 * w137[1])),  \
+      (T_VEC)(w113[1] + (-kWeight4 * w137[1] - kWeight3 * w137[0]))   \
+    };                                                                \
+    const T_VEC w153[2] = {                                           \
+      (T_VEC)(w113[0] + (kWeight4 * w137[0] - kWeight3 * w137[1])),   \
+      (T_VEC)(w113[1] + (kWeight4 * w137[1] + kWeight3 * w137[0]))    \
+    };                                                                \
+    const T_VEC w154[2] = {                                           \
+      (T_VEC)(w115[0] + (-kWeight2 * w139[0] + kWeight2 * w139[1])),  \
+      (T_VEC)(w115[1] + (-kWeight2 * w139[1] - kWeight2 * w139[0]))   \
+    };                                                                \
+    const T_VEC w155[2] = {                                           \
+      (T_VEC)(w115[0] + (kWeight2 * w139[0] - kWeight2 * w139[1])),   \
+      (T_VEC)(w115[1] + (kWeight2 * w139[1] + kWeight2 * w139[0]))    \
+    };                                                                \
+    const T_VEC w156[2] = {                                           \
+      (T_VEC)(w117[0] + (-kWeight3 * w141[0] + kWeight4 * w141[1])),  \
+      (T_VEC)(w117[1] + (-kWeight3 * w141[1] - kWeight4 * w141[0]))   \
+    };                                                                \
+    const T_VEC w157[2] = {                                           \
+      (T_VEC)(w117[0] + (kWeight3 * w141[0] - kWeight4 * w141[1])),   \
+      (T_VEC)(w117[1] + (kWeight3 * w141[1] + kWeight4 * w141[0]))    \
+    };                                                                \
+    store(output + 0 * stride, w78[0] + w142[0]);                     \
+    store(output + 1 * stride,                                        \
+          w80[0] + (kWeight5 * w144[0] + kWeight6 * w144[1]));        \
+    store(output + 2 * stride,                                        \
+          w82[0] + (kWeight3 * w146[0] + kWeight4 * w146[1]));        \
+    store(output + 3 * stride,                                        \
+          w84[0] + (kWeight7 * w148[0] + kWeight8 * w148[1]));        \
+    store(output + 4 * stride,                                        \
+          w86[0] + (kWeight2 * w150[0] + kWeight2 * w150[1]));        \
+    store(output + 5 * stride,                                        \
+          w88[0] + (kWeight8 * w152[0] + kWeight7 * w152[1]));        \
+    store(output + 6 * stride,                                        \
+          w90[0] + (kWeight4 * w154[0] + kWeight3 * w154[1]));        \
+    store(output + 7 * stride,                                        \
+          w92[0] + (kWeight6 * w156[0] + kWeight5 * w156[1]));        \
+    store(output + 8 * stride, w79[0] + w143[1]);                     \
+    store(output + 9 * stride,                                        \
+          w81[0] + (-kWeight6 * w145[0] + kWeight5 * w145[1]));       \
+    store(output + 10 * stride,                                       \
+          w83[0] + (-kWeight4 * w147[0] + kWeight3 * w147[1]));       \
+    store(output + 11 * stride,                                       \
+          w85[0] + (-kWeight8 * w149[0] + kWeight7 * w149[1]));       \
+    store(output + 12 * stride,                                       \
+          w87[0] + (-kWeight2 * w151[0] + kWeight2 * w151[1]));       \
+    store(output + 13 * stride,                                       \
+          w89[0] + (-kWeight7 * w153[0] + kWeight8 * w153[1]));       \
+    store(output + 14 * stride,                                       \
+          w91[0] + (-kWeight3 * w155[0] + kWeight4 * w155[1]));       \
+    store(output + 15 * stride,                                       \
+          w93[0] + (-kWeight5 * w157[0] + kWeight6 * w157[1]));       \
+    store(output + 16 * stride, w78[0] - w142[0]);                    \
+    store(output + 17 * stride,                                       \
+          w80[0] + (-kWeight5 * w144[0] - kWeight6 * w144[1]));       \
+    store(output + 18 * stride,                                       \
+          w82[0] + (-kWeight3 * w146[0] - kWeight4 * w146[1]));       \
+    store(output + 19 * stride,                                       \
+          w84[0] + (-kWeight7 * w148[0] - kWeight8 * w148[1]));       \
+    store(output + 20 * stride,                                       \
+          w86[0] + (-kWeight2 * w150[0] - kWeight2 * w150[1]));       \
+    store(output + 21 * stride,                                       \
+          w88[0] + (-kWeight8 * w152[0] - kWeight7 * w152[1]));       \
+    store(output + 22 * stride,                                       \
+          w90[0] + (-kWeight4 * w154[0] - kWeight3 * w154[1]));       \
+    store(output + 23 * stride,                                       \
+          w92[0] + (-kWeight6 * w156[0] - kWeight5 * w156[1]));       \
+    store(output + 24 * stride, w79[0] - w143[1]);                    \
+    store(output + 25 * stride,                                       \
+          w81[0] + (kWeight6 * w145[0] - kWeight5 * w145[1]));        \
+    store(output + 26 * stride,                                       \
+          w83[0] + (kWeight4 * w147[0] - kWeight3 * w147[1]));        \
+    store(output + 27 * stride,                                       \
+          w85[0] + (kWeight8 * w149[0] - kWeight7 * w149[1]));        \
+    store(output + 28 * stride,                                       \
+          w87[0] + (kWeight2 * w151[0] - kWeight2 * w151[1]));        \
+    store(output + 29 * stride,                                       \
+          w89[0] + (kWeight7 * w153[0] - kWeight8 * w153[1]));        \
+    store(output + 30 * stride,                                       \
+          w91[0] + (kWeight3 * w155[0] - kWeight4 * w155[1]));        \
+    store(output + 31 * stride,                                       \
+          w93[0] + (kWeight5 * w157[0] - kWeight6 * w157[1]));        \
+  }
+
+#endif  // AOM_DSP_FFT_COMMON_H_
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index 117185c..6c0cf62 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -18,6 +18,7 @@
 #include "aom_dsp/noise_model.h"
 #include "aom_dsp/noise_util.h"
 #include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
 #include "av1/encoder/mathutils.h"
 
 #define kLowPolyNumParams 3
@@ -1268,3 +1269,192 @@
   film_grain->overlap_flag = 1;
   return 1;
 }
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+  for (int i = 0; i < n; ++i) {
+    b[i] *= a[i];
+  }
+}
+
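+// Builds a block_size x block_size separable half-cosine window used to taper
+// overlapping blocks. The caller owns the returned buffer and must release it
+// with aom_free().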
+static float *get_half_cos_window(int block_size) {
+  float *window_function =
+      (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+  for (int y = 0; y < block_size; ++y) {
+    const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+    for (int x = 0; x < block_size; ++x) {
+      const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+      window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+    }
+  }
+  return window_function;
+}
+
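+// Converts the accumulated floating point result back to integer pixels,
+// diffusing the quantization error to not-yet-visited neighbors with
+// Floyd-Steinberg style weights (7/16, 3/16, 5/16, 1/16) to avoid banding.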
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix)                               \
+  static void dither_and_quantize_##suffix(                                 \
+      float *result, int result_stride, INT_TYPE *denoised, int w, int h,   \
+      int stride, int chroma_sub_w, int chroma_sub_h, int block_size,       \
+      float block_normalization) {                                          \
+    for (int y = 0; y < (h >> chroma_sub_h); ++y) {                         \
+      for (int x = 0; x < (w >> chroma_sub_w); ++x) {                       \
+        const int result_idx =                                              \
+            (y + (block_size >> chroma_sub_h)) * result_stride + x +        \
+            (block_size >> chroma_sub_w);                                   \
+        INT_TYPE new_val = (INT_TYPE)AOMMIN(                                \
+            AOMMAX(result[result_idx] * block_normalization + 0.5f, 0),     \
+            block_normalization);                                           \
+        const float err =                                                   \
+            -(((float)new_val) / block_normalization - result[result_idx]); \
+        denoised[y * stride + x] = new_val;                                 \
+        if (x + 1 < (w >> chroma_sub_w)) {                                  \
+          result[result_idx + 1] += err * 7.0f / 16.0f;                     \
+        }                                                                   \
+        if (y + 1 < (h >> chroma_sub_h)) {                                  \
+          if (x > 0) {                                                      \
+            result[result_idx + result_stride - 1] += err * 3.0f / 16.0f;   \
+          }                                                                 \
+          result[result_idx + result_stride] += err * 5.0f / 16.0f;         \
+          if (x + 1 < (w >> chroma_sub_w)) {                                \
+            result[result_idx + result_stride + 1] += err * 1.0f / 16.0f;   \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    }                                                                       \
+  }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
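+// Denoises each plane with half-overlapped block processing: every block is
+// windowed, transformed, Wiener filtered against the plane's noise PSD,
+// inverse transformed, and accumulated into an overlap buffer that is finally
+// dithered back to integer pixel values.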
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd) {
+  float *plane = NULL, *block = NULL, *window_full = NULL,
+        *window_chroma = NULL;
+  double *block_d = NULL, *plane_d = NULL;
+  struct aom_noise_tx_t *tx_full = NULL;
+  struct aom_noise_tx_t *tx_chroma = NULL;
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  const int result_stride = (num_blocks_w + 2) * block_size;
+  const int result_height = (num_blocks_h + 2) * block_size;
+  float *result = NULL;
+  int init_success = 1;
+  aom_flat_block_finder_t block_finder_full;
+  aom_flat_block_finder_t block_finder_chroma;
+  const float kBlockNormalization = (1 << bit_depth) - 1;
+  if (chroma_sub[0] != chroma_sub[1]) {
+    fprintf(stderr,
+            "aom_wiener_denoise_2d doesn't handle different chroma "
+            "subsampling");
+    return 0;
+  }
+  init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+                                             bit_depth, use_highbd);
+  result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
+                               sizeof(*result));
+  plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+  block =
+      (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+  block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+  plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+  window_full = get_half_cos_window(block_size);
+  tx_full = aom_noise_tx_malloc(block_size);
+
+  if (chroma_sub[0] != 0) {
+    init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+                                               block_size >> chroma_sub[0],
+                                               bit_depth, use_highbd);
+    window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+    tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+  } else {
+    window_chroma = window_full;
+    tx_chroma = tx_full;
+  }
+
+  init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+                  (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+                  (window_full != NULL) && (window_chroma != NULL) &&
+                  (result != NULL);
+  for (int c = init_success ? 0 : 3; c < 3; ++c) {
+    float *window_function = c == 0 ? window_full : window_chroma;
+    aom_flat_block_finder_t *block_finder = &block_finder_full;
+    const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+    const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+    struct aom_noise_tx_t *tx =
+        (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+    if (!data[c] || !denoised[c]) continue;
+    if (c > 0 && chroma_sub[0] != 0) {
+      block_finder = &block_finder_chroma;
+    }
+    memset(result, 0, sizeof(*result) * result_stride * result_height);
+    // Do overlapped block processing (half overlapped). The block rows could
+    // easily be processed in parallel.
+    for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+         offsy += (block_size >> chroma_sub_h) / 2) {
+      for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+           offsx += (block_size >> chroma_sub_w) / 2) {
+        // Pad the boundary when processing each block-set.
+        for (int by = -1; by < num_blocks_h; ++by) {
+          for (int bx = -1; bx < num_blocks_w; ++bx) {
+            const int pixels_per_block =
+                (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+            aom_flat_block_finder_extract_block(
+                block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+                stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+                by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+            for (int j = 0; j < pixels_per_block; ++j) {
+              block[j] = (float)block_d[j];
+              plane[j] = (float)plane_d[j];
+            }
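+            // Window the block, move it to the frequency domain, apply the
+            // Wiener filter, and transform back.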
+            pointwise_multiply(window_function, block, pixels_per_block);
+            aom_noise_tx_forward(tx, block);
+            aom_noise_tx_filter(tx, noise_psd[c]);
+            aom_noise_tx_inverse(tx, block);
+
+            // Apply window function to the plane approximation (we will apply
+            // it to the sum of plane + block when composing the results).
+            pointwise_multiply(window_function, plane, pixels_per_block);
+
+            for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+              const int y_result =
+                  y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+              for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+                const int x_result =
+                    x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+                result[y_result * result_stride + x_result] +=
+                    (block[y * (block_size >> chroma_sub_w) + x] +
+                     plane[y * (block_size >> chroma_sub_w) + x]) *
+                    window_function[y * (block_size >> chroma_sub_w) + x];
+              }
+            }
+          }
+        }
+      }
+    }
+    if (use_highbd) {
+      dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+                                 w, h, stride[c], chroma_sub_w, chroma_sub_h,
+                                 block_size, kBlockNormalization);
+    } else {
+      dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+                                stride[c], chroma_sub_w, chroma_sub_h,
+                                block_size, kBlockNormalization);
+    }
+  }
+  aom_free(result);
+  aom_free(plane);
+  aom_free(block);
+  aom_free(plane_d);
+  aom_free(block_d);
+  aom_free(window_full);
+
+  aom_noise_tx_free(tx_full);
+
+  aom_flat_block_finder_free(&block_finder_full);
+  if (chroma_sub[0] != 0) {
+    aom_flat_block_finder_free(&block_finder_chroma);
+    aom_free(window_chroma);
+    aom_noise_tx_free(tx_chroma);
+  }
+  return init_success;
+}
diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h
index 34a364d..1e0fbb6 100644
--- a/aom_dsp/noise_model.h
+++ b/aom_dsp/noise_model.h
@@ -261,6 +261,26 @@
 int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
                                          aom_film_grain_t *film_grain);
 
+/*!\brief Performs 2D Wiener filter denoising using the provided noise psd.
+ *
+ * \param[in]     data            Raw frame data
+ * \param[out]    denoised        Denoised frame data
+ * \param[in]     w               Frame width
+ * \param[in]     h               Frame height
+ * \param[in]     stride          Stride of the planes
+ * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in]     noise_psd       The power spectral density of the noise
+ * \param[in]     block_size      The size of blocks
+ * \param[in]     bit_depth       Bit depth of the image
+ * \param[in]     use_highbd      If true, uint8 pointers are interpreted as
+ *                                uint16 and stride is measured in uint16.
+ *                                This must be true when bit_depth >= 10.
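+ *
+ * \return 1 on success, 0 on failure (e.g., allocation failure or mismatched
+ *         chroma subsampling factors).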
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub_log2[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/aom_dsp/noise_util.c b/aom_dsp/noise_util.c
index c81874c..db4ca07 100644
--- a/aom_dsp/noise_util.c
+++ b/aom_dsp/noise_util.c
@@ -13,9 +13,120 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
 #include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
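+// Returns a flat default PSD value; it scales with the square of the strength
+// factor and with the block area (block_size * block_size).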
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+  float *tx_block;
+  float *temp;
+  int block_size;
+  void (*fft)(const float *, float *, float *);
+  void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+  struct aom_noise_tx_t *noise_tx =
+      (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+  if (!noise_tx) return NULL;
+  memset(noise_tx, 0, sizeof(*noise_tx));
+  switch (block_size) {
+    case 2:
+      noise_tx->fft = aom_fft2x2_float;
+      noise_tx->ifft = aom_ifft2x2_float;
+      break;
+    case 4:
+      noise_tx->fft = aom_fft4x4_float;
+      noise_tx->ifft = aom_ifft4x4_float;
+      break;
+    case 8:
+      noise_tx->fft = aom_fft8x8_float;
+      noise_tx->ifft = aom_ifft8x8_float;
+      break;
+    case 16:
+      noise_tx->fft = aom_fft16x16_float;
+      noise_tx->ifft = aom_ifft16x16_float;
+      break;
+    case 32:
+      noise_tx->fft = aom_fft32x32_float;
+      noise_tx->ifft = aom_ifft32x32_float;
+      break;
+    default:
+      aom_free(noise_tx);
+      fprintf(stderr, "Unsupported block size %d\n", block_size);
+      return NULL;
+  }
+  noise_tx->block_size = block_size;
+  noise_tx->tx_block = (float *)aom_memalign(
+      32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+  noise_tx->temp = (float *)aom_memalign(
+      32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+  if (!noise_tx->tx_block || !noise_tx->temp) {
+    aom_noise_tx_free(noise_tx);
+    return NULL;
+  }
+  return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+  noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
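+// Wiener-style shrinkage of each complex coefficient: when the observed power
+// p exceeds kBeta * psd[i], the coefficient is scaled by (p - psd[i]) / p;
+// otherwise it is attenuated by the constant factor (kBeta - 1) / kBeta.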
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+  const int block_size = noise_tx->block_size;
+  const float kBeta = 1.1f;
+  const float kEps = 1e-6f;
+  for (int y = 0; y < block_size; ++y) {
+    for (int x = 0; x < block_size; ++x) {
+      int i = y * block_size + x;
+      float *c = noise_tx->tx_block + 2 * i;
+      const float p = c[0] * c[0] + c[1] * c[1];
+      if (p > kBeta * psd[i] && p > 1e-6) {
+        noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+        noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+      } else {
+        noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+        noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+      }
+    }
+  }
+}
+
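+// Inverse-transforms the internal buffer into "data". The generated
+// transforms are unnormalized, so the result is divided by
+// block_size * block_size to undo the gain of the forward/inverse pair.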
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+  const int n = noise_tx->block_size * noise_tx->block_size;
+  noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+  for (int i = 0; i < n; ++i) {
+    data[i] /= n;
+  }
+}
+
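+// For real-valued input the right half of the spectrum is the complex
+// conjugate of the left half, so only coefficients with xb <= block_size / 2
+// are accumulated.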
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+                             float *psd) {
+  const int block_size = noise_tx->block_size;
+  for (int yb = 0; yb < block_size; ++yb) {
+    for (int xb = 0; xb <= block_size / 2; ++xb) {
+      float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+      psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+    }
+  }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+  if (!noise_tx) return;
+  aom_free(noise_tx->tx_block);
+  aom_free(noise_tx->temp);
+  aom_free(noise_tx);
+}
 
 double aom_normalized_cross_correlation(const double *a, const double *b,
                                         int n) {
diff --git a/aom_dsp/noise_util.h b/aom_dsp/noise_util.h
index f3366d8..ea4d9e3 100644
--- a/aom_dsp/noise_util.h
+++ b/aom_dsp/noise_util.h
@@ -16,6 +16,44 @@
 extern "C" {
 #endif  // __cplusplus
 
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and holds the transformed data internally
+// (the user should not manipulate the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t for denoising blocks of the given
+// block_size. The resulting aom_noise_tx_t should be freed with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the given data and holds the result in the aom_noise_tx's
+// internal buffer. For compatibility with existing SIMD implementations,
+// "data" must be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+                          const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+                             float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
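+
+// A minimal usage sketch (error handling omitted; "block" is assumed to be a
+// 32-byte aligned buffer of block_size * block_size floats and "psd" a buffer
+// of the same size; block_size must be one of 2, 4, 8, 16, or 32):
+//   struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+//   aom_noise_tx_forward(tx, block);
+//   aom_noise_tx_filter(tx, psd);
+//   aom_noise_tx_inverse(tx, block);  // "block" now holds the filtered data
+//   aom_noise_tx_free(tx);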
+
 // Computes normalized cross correlation of two vectors a and b of length n.
 double aom_normalized_cross_correlation(const double *a, const double *b,
                                         int n);
diff --git a/aom_dsp/x86/fft_avx2.c b/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 0000000..ff2a539
--- /dev/null
+++ b/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+                                          int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+          _mm256_store_ps, _mm256_set1_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps);
+
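+// The trailing argument passed to aom_fft_2d_gen is the SIMD vector width in
+// floats: 8 for the __m256 code here (the SSE2 versions pass 4).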
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+                  aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+                  aom_transpose_float_sse2, 8);
+}
diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 0000000..0447e5a
--- /dev/null
+++ b/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+                                const int ldb) {
+  __m128 row1 = _mm_load_ps(&A[0 * lda]);
+  __m128 row2 = _mm_load_ps(&A[1 * lda]);
+  __m128 row3 = _mm_load_ps(&A[2 * lda]);
+  __m128 row4 = _mm_load_ps(&A[3 * lda]);
+  _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+  _mm_store_ps(&B[0 * ldb], row1);
+  _mm_store_ps(&B[1 * ldb], row2);
+  _mm_store_ps(&B[2 * ldb], row3);
+  _mm_store_ps(&B[3 * ldb], row4);
+}
+
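+// Transposes the n x n float matrix A into B in 4x4 tiles; n must be a
+// multiple of 4 and both buffers 16-byte aligned for the SSE loads/stores.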
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+  for (int y = 0; y < n; y += 4) {
+    for (int x = 0; x < n; x += 4) {
+      transpose4x4(A + y * n + x, B + x * n + y, n, n);
+    }
+  }
+}
+
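+// Expands the packed output of the real 2d transform (real and imaginary
+// parts stored in separate n/2 halves) into a full n x n array of interleaved
+// complex values, using the conjugate symmetry of a real-input FFT to
+// reconstruct the second half of the rows.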
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+  const int n2 = n / 2;
+  output[0] = packed[0];
+  output[1] = 0;
+  output[2 * (n2 * n)] = packed[n2 * n];
+  output[2 * (n2 * n) + 1] = 0;
+
+  output[2 * n2] = packed[n2];
+  output[2 * n2 + 1] = 0;
+  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+  output[2 * (n2 * n + n2) + 1] = 0;
+
+  for (int c = 1; c < n2; ++c) {
+    output[2 * (0 * n + c)] = packed[c];
+    output[2 * (0 * n + c) + 1] = packed[c + n2];
+    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+  }
+  for (int r = 1; r < n2; ++r) {
+    output[2 * (r * n + 0)] = packed[r * n];
+    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+    for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+      output[2 * (r * n + c)] =
+          packed[r * n + c] - packed[(r + n2) * n + c + n2];
+      output[2 * (r * n + c) + 1] =
+          packed[(r + n2) * n + c] + packed[r * n + c + n2];
+    }
+
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+      real1 = real1 - real2;
+      imag1 = imag1 + imag2;
+      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+    }
+
+    int r2 = r + n2;
+    int r3 = n - r2;
+    output[2 * (r2 * n + 0)] = packed[r3 * n];
+    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+    for (int c = 1; c < AOMMIN(4, n2); ++c) {
+      output[2 * (r2 * n + c)] =
+          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+      output[2 * (r2 * n + c) + 1] =
+          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+    }
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+      real1 = real1 + real2;
+      imag1 = imag2 - imag1;
+      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r2 * n + c + 2),
+                   _mm_unpackhi_ps(real1, imag1));
+    }
+  }
+}
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps);
+
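+// Inverse 2d wrappers: aom_ifft_2d_gen is given the scalar forward 1d
+// transform (aom_fft1d_N_float) together with the SSE2 forward and inverse
+// 1d kernels; see aom_dsp/fft_common.h for how they are combined.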
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+                  aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+                  aom_transpose_float_sse2, 4);
+}
diff --git a/test/fft_test.cc b/test/fft_test.cc
new file mode 100644
index 0000000..1ab9a11
--- /dev/null
+++ b/test/fft_test.cc
@@ -0,0 +1,240 @@
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/x86.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Simple 1D FFT implementation
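+// (recursive radix-2 decimation in time: split into even/odd samples,
+// transform each half, then combine with twiddle factors
+// w = e^{-2*pi*i*k/n}). Used only as a reference for the optimized kernels.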
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+  if (n == 1) {
+    result[0] = data[0];
+    return;
+  }
+  std::vector<InputType> temp(n);
+  for (int k = 0; k < n / 2; ++k) {
+    temp[k] = data[2 * k];
+    temp[n / 2 + k] = data[2 * k + 1];
+  }
+  fft(&temp[0], result, n / 2);
+  fft(&temp[n / 2], result + n / 2, n / 2);
+  for (int k = 0; k < n / 2; ++k) {
+    std::complex<float> w = std::complex<float>((float)cos(2. * M_PI * k / n),
+                                                (float)-sin(2. * M_PI * k / n));
+    std::complex<float> a = result[k];
+    std::complex<float> b = result[n / 2 + k];
+    result[k] = a + w * b;
+    result[n / 2 + k] = a - w * b;
+  }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+  for (int y = 0; y < n; ++y) {
+    for (int x = y + 1; x < n; ++x) {
+      std::swap((*data)[y * n + x], (*data)[x * n + y]);
+    }
+  }
+}
+
+// Simple 2D FFT implementation
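+// (transform each row, transpose, transform each row again, transpose back).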
+template <class InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+  std::vector<std::complex<float> > rowfft(n * n);
+  std::vector<std::complex<float> > result(n * n);
+  for (int y = 0; y < n; ++y) {
+    fft(input + y * n, &rowfft[y * n], n);
+  }
+  transpose(&rowfft, n);
+  for (int y = 0; y < n; ++y) {
+    fft(&rowfft[y * n], &result[y * n], n);
+  }
+  transpose(&result, n);
+  return result;
+}
+
+struct FFTTestArg {
+  int n;
+  void (*fft)(const float *input, float *temp, float *output);
+  int flag;
+};
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+    memset(input_, 0, sizeof(*input_) * n * n);
+    memset(temp_, 0, sizeof(*temp_) * n * n);
+    memset(output_, 0, sizeof(*output_) * n * n * 2);
+#if ARCH_X86_64
+    disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+    disabled_ = GetParam().flag != 0;
+#endif
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  int disabled_;
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
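+// Check each unit impulse against the reference 2d FFT. Only columns
+// 0..n/2 are compared; for real inputs the remaining columns are determined
+// by conjugate symmetry.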
+TEST_P(FFT2DTest, Correct) {
+  if (disabled_) return;
+
+  int n = GetParam().n;
+  for (int i = 0; i < n * n; ++i) {
+    input_[i] = 1;
+    std::vector<std::complex<float> > expected = fft2d<float>(&input_[0], n);
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    for (int y = 0; y < n; ++y) {
+      for (int x = 0; x < (n / 2) + 1; ++x) {
+        EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+        EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+                    1e-5);
+      }
+    }
+    input_[i] = 0;
+  }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+  if (disabled_) return;
+
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+  (void)sum;  // Reference the accumulated value so the calls are not elided.
+}
+
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestC, FFT2DTest,
+    ::testing::Values((FFTTestArg){ 2, aom_fft2x2_float_c, 0 },
+                      (FFTTestArg){ 4, aom_fft4x4_float_c, 0 },
+                      (FFTTestArg){ 8, aom_fft8x8_float_c, 0 },
+                      (FFTTestArg){ 16, aom_fft16x16_float_c, 0 },
+                      (FFTTestArg){ 32, aom_fft32x32_float_c, 0 }));
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestSSE2, FFT2DTest,
+    ::testing::Values((FFTTestArg){ 4, aom_fft4x4_float_sse2, HAS_SSE2 },
+                      (FFTTestArg){ 8, aom_fft8x8_float_sse2, HAS_SSE2 },
+                      (FFTTestArg){ 16, aom_fft16x16_float_sse2, HAS_SSE2 },
+                      (FFTTestArg){ 32, aom_fft32x32_float_sse2, HAS_SSE2 }));
+
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestAVX2, FFT2DTest,
+    ::testing::Values((FFTTestArg){ 8, aom_fft8x8_float_avx2, HAS_AVX2 },
+                      (FFTTestArg){ 16, aom_fft16x16_float_avx2, HAS_AVX2 },
+                      (FFTTestArg){ 32, aom_fft32x32_float_avx2, HAS_AVX2 }));
+#endif
+
+struct IFFTTestArg {
+  int n;
+  void (*ifft)(const float *input, float *temp, float *output);
+  int flag;
+};
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+    memset(input_, 0, sizeof(*input_) * n * n * 2);
+    memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+    memset(output_, 0, sizeof(*output_) * n * n);
+#if ARCH_X86_64
+    disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+    disabled_ = GetParam().flag != 0;
+#endif
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  int disabled_;
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+  if (disabled_) return;
+  int n = GetParam().n;
+  ASSERT_GE(n, 2);
+  std::vector<float> expected(n * n);
+  std::vector<float> actual(n * n);
+  // Apply the reference forward transform, then the inverse under test, and
+  // check that the original impulse is recovered (the output is divided by
+  // n * n below to undo the transform scaling).
+  for (int y = 0; y < n; ++y) {
+    for (int x = 0; x < n; ++x) {
+      expected[y * n + x] = 1;
+      std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+      for (int i = 0; i < n * n; ++i) {
+        input_[2 * i + 0] = input_c[i].real();
+        input_[2 * i + 1] = input_c[i].imag();
+      }
+      GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+      for (int yy = 0; yy < n; ++yy) {
+        for (int xx = 0; xx < n; ++xx) {
+          EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+                      1e-5);
+        }
+      }
+      expected[y * n + x] = 0;
+    }
+  }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+  if (disabled_) return;
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+  (void)sum;  // Reference the accumulated value so the calls are not elided.
+}
+
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestC, IFFT2DTest,
+    ::testing::Values((IFFTTestArg){ 2, aom_ifft2x2_float_c, 0 },
+                      (IFFTTestArg){ 4, aom_ifft4x4_float_c, 0 },
+                      (IFFTTestArg){ 8, aom_ifft8x8_float_c, 0 },
+                      (IFFTTestArg){ 16, aom_ifft16x16_float_c, 0 },
+                      (IFFTTestArg){ 32, aom_ifft32x32_float_c, 0 }));
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestSSE2, IFFT2DTest,
+    ::testing::Values((IFFTTestArg){ 4, aom_ifft4x4_float_sse2, HAS_SSE2 },
+                      (IFFTTestArg){ 8, aom_ifft8x8_float_sse2, HAS_SSE2 },
+                      (IFFTTestArg){ 16, aom_ifft16x16_float_sse2, HAS_SSE2 },
+                      (IFFTTestArg){ 32, aom_ifft32x32_float_sse2, HAS_SSE2 }));
+
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestAVX2, IFFT2DTest,
+    ::testing::Values((IFFTTestArg){ 8, aom_ifft8x8_float_avx2, HAS_AVX2 },
+                      (IFFTTestArg){ 16, aom_ifft16x16_float_avx2, HAS_AVX2 },
+                      (IFFTTestArg){ 32, aom_ifft32x32_float_avx2, HAS_AVX2 }));
+#endif
+}  // namespace
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index d454c31..26c4aa6 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -4,6 +4,7 @@
 
 #include "aom_dsp/noise_model.h"
 #include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -60,6 +61,45 @@
   }
 }
 
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+                                 int block_size) {
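+  // Estimate the noise PSD by averaging the transform-domain energy of
+  // 50%-overlapping block_size x block_size blocks of |noise|.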
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(*block));
+  std::vector<float> psd(block_size * block_size);
+  int num_blocks = 0;
+  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  for (int y = 0; y <= height - block_size; y += block_size / 2) {
+    for (int x = 0; x <= width - block_size; x += block_size / 2) {
+      for (int yy = 0; yy < block_size; ++yy) {
+        for (int xx = 0; xx < block_size; ++xx) {
+          block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+        }
+      }
+      aom_noise_tx_forward(tx, &block[0]);
+      aom_noise_tx_add_energy(tx, &psd[0]);
+      num_blocks++;
+    }
+  }
+  for (int yy = 0; yy < block_size; ++yy) {
+    for (int xx = 0; xx <= block_size / 2; ++xx) {
+      psd[yy * block_size + xx] /= num_blocks;
+    }
+  }
+  // Fill in the data that is missing due to symmetries
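+  // (the PSD of real-valued noise satisfies
+  // psd[(n - y) % n][(n - x) % n] == psd[y][x], with n = block_size).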
+  for (int xx = 1; xx < block_size / 2; ++xx) {
+    psd[(block_size - xx)] = psd[xx];
+  }
+  for (int yy = 1; yy < block_size; ++yy) {
+    for (int xx = 1; xx < block_size / 2; ++xx) {
+      psd[(block_size - yy) * block_size + (block_size - xx)] =
+          psd[yy * block_size + xx];
+    }
+  }
+  aom_noise_tx_free(tx);
+  aom_free(block);
+  return psd;
+}
+
 }  // namespace
 
 TEST(NoiseStrengthSolver, GetCentersTwoBins) {
@@ -1098,3 +1138,193 @@
 
   aom_noise_model_free(&model);
 }
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+  static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+  void SetUp() {
+    static const float kNoiseLevel = 5.f;
+    static const float kStd = 4.0;
+    static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+    chroma_sub_[0] = 1;
+    chroma_sub_[1] = 1;
+    stride_[0] = kWidth;
+    stride_[1] = kWidth / 2;
+    stride_[2] = kWidth / 2;
+    for (int k = 0; k < 3; ++k) {
+      data_[k].resize(kWidth * kHeight);
+      denoised_[k].resize(kWidth * kHeight);
+      noise_psd_[k].resize(kBlockSize * kBlockSize);
+    }
+
+    const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+                                0.048,  0.404,  0.2353, -0.035, -0.093, 0.441 };
+    const int kCoords[12][2] = {
+      { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+      { -1, -1 }, { 0, -1 },  { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+    };
+    const int kLag = 2;
+    const int kLength = 12;
+    libaom_test::ACMRandom random;
+    std::vector<double> noise(kWidth * kHeight);
+    noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+                kHeight);
+    noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+    for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+      noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+                                 kScaleNoise / (kMaxValue * kMaxValue));
+    }
+
+    float psd_value =
+        aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+    for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+      noise_psd_[1][i] = psd_value;
+      noise_psd_[2][i] = psd_value;
+    }
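+    // Synthesize the test planes: luma is a horizontal ramp plus correlated
+    // (AR) noise of strength kStd; each chroma plane is a ramp plus white
+    // Gaussian noise.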
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kWidth; ++x) {
+        data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+            (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+      }
+    }
+
+    for (int c = 1; c < 3; ++c) {
+      for (int y = 0; y < (kHeight >> 1); ++y) {
+        for (int x = 0; x < (kWidth >> 1); ++x) {
+          data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+              (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+        }
+      }
+    }
+    for (int k = 0; k < 3; ++k) {
+      noise_psd_ptrs_[k] = &noise_psd_[k][0];
+    }
+  }
+  static const int kBlockSize = 32;
+  static const int kBlockSizeChroma = 16;
+  static const int kWidth = 256;
+  static const int kHeight = 256;
+  static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+  std::vector<typename T::data_type_t> data_[3];
+  std::vector<typename T::data_type_t> denoised_[3];
+  std::vector<float> noise_psd_[3];
+  int chroma_sub_[2];
+  float *noise_psd_ptrs_[3];
+  int stride_[3];
+};
+
+TYPED_TEST_CASE_P(WienerDenoiseTest);
+
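+// aom_wiener_denoise_2d is expected to reject unsupported block sizes by
+// returning 0.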
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     18, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     48, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  int chroma_sub[2] = { 1, 0 };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+
+  chroma_sub[0] = 0;
+  chroma_sub[1] = 1;
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  const int ret = aom_wiener_denoise_2d(
+      data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+      this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+      this->kBitDepth, this->kUseHighBD);
+  EXPECT_EQ(1, ret);
+
+  // Check the noise on the denoised image (from the analytical gradient)
+  // and make sure that it is less than what we added.
+  for (int c = 0; c < 3; ++c) {
+    std::vector<double> measured_noise(kWidth * kHeight);
+
+    double var = 0;
+    for (int x = 0; x < (kWidth >> (c > 0)); ++x) {
+      for (int y = 0; y < (kHeight >> (c > 0)); ++y) {
+        const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+                            x * this->kScaleNoise;
+        var += diff * diff;
+        measured_noise[y * kWidth + x] = diff;
+      }
+    }
+    var /= (kWidth * kHeight);
+    const double std = sqrt(std::max(0.0, var));
+    EXPECT_LE(std, 1.25f * this->kScaleNoise);
+    if (c == 0) {
+      std::vector<float> measured_psd =
+          get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+      std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+      std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+      std::copy(measured_psd.begin(), measured_psd.end(),
+                measured_psd_d.begin());
+      std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+                noise_psd_d.begin());
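+      // If the correlated noise was removed, the PSD of the residual should
+      // no longer resemble the PSD of the injected noise, so the normalized
+      // cross-correlation should be small.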
+      EXPECT_LT(aom_normalized_cross_correlation(
+                    &measured_psd_d[0], &noise_psd_d[0], noise_psd_d.size()),
+                0.35);
+    }
+  }
+}
+
+REGISTER_TYPED_TEST_CASE_P(WienerDenoiseTest, InvalidBlockSize,
+                           InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_CASE_P(WienerDenoiseTestInstantiation,
+                              WienerDenoiseTest, AllBitDepthParams);
diff --git a/test/test.cmake b/test/test.cmake
index 73ddd34..7720c34 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -173,6 +173,7 @@
         "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
         "${AOM_ROOT}/test/blend_a64_mask_test.cc"
         "${AOM_ROOT}/test/error_block_test.cc"
+        "${AOM_ROOT}/test/fft_test.cc"
         "${AOM_ROOT}/test/fwht4x4_test.cc"
         "${AOM_ROOT}/test/masked_sad_test.cc"
         "${AOM_ROOT}/test/masked_variance_test.cc"