Use CMake to build TF-Lite

This enables Windows builds. Update minimum requirements in
documentation.

BUG=aomedia:2613

Change-Id: I8f66ad9e733d905b98c3448e58f10d100cbac142
diff --git a/.gitattributes b/.gitattributes
index 0bb5faf..ffc6912 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -16,8 +16,3 @@
 *.pjt        -crlf
 *.vcp        -crlf
 *.inf        -crlf
-# fft2d is not available through git, and the code
-# is checked in verbatim. Do not apply diffing to
-# it, otherwise it fails the 'git diff HEAD^ --check'
-# test.
-third_party/tensorflow_dependencies/fft2d/** -diff
diff --git a/.gitmodules b/.gitmodules
index fae903e..b1cb803 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,27 +1,3 @@
 [submodule "third_party/tensorflow"]
 	path = third_party/tensorflow
 	url = https://github.com/tensorflow/tensorflow.git
-[submodule "third_party/tensorflow_dependencies/absl"]
-	path = third_party/tensorflow_dependencies/absl
-	url = https://github.com/abseil/abseil-cpp.git
-[submodule "third_party/tensorflow_dependencies/neon_2_sse"]
-	path = third_party/tensorflow_dependencies/neon_2_sse
-	url = https://github.com/intel/ARM_NEON_2_x86_SSE.git
-[submodule "third_party/tensorflow_dependencies/farmhash"]
-	path = third_party/tensorflow_dependencies/farmhash
-	url = https://github.com/google/farmhash
-[submodule "third_party/tensorflow_dependencies/eigen"]
-	path = third_party/tensorflow_dependencies/eigen
-	url = https://gitlab.com/libeigen/eigen.git
-[submodule "third_party/tensorflow_dependencies/flatbuffers"]
-	path = third_party/tensorflow_dependencies/flatbuffers
-	url = https://github.com/google/flatbuffers
-[submodule "third_party/tensorflow_dependencies/gemmlowp"]
-	path = third_party/tensorflow_dependencies/gemmlowp
-	url = https://github.com/google/gemmlowp.git
-[submodule "third_party/tensorflow_dependencies/fp16"]
-	path = third_party/tensorflow_dependencies/fp16
-	url = https://github.com/Maratyszcza/FP16.git
-[submodule "third_party/tensorflow_dependencies/ruy"]
-	path = third_party/tensorflow_dependencies/ruy
-	url = https://github.com/google/ruy.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3fb214d..2d96d6c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@
 set(SO_VERSION 2)
 set(SO_FILE_VERSION 2.0.0)
 
+include("${AOM_ROOT}/build/cmake/tensorflow_lite.cmake")
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
 include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
 include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
@@ -629,19 +630,19 @@
 endif()
 
 if(CONFIG_TENSORFLOW_LITE)
-  include("${AOM_ROOT}/build/cmake/tensorflow_lite.cmake")
-  setup_tensorflow_lite()
-
-  if(ENABLE_EXAMPLES)
-    add_executable(tf_lite_model "${AOM_ROOT}/examples/tf_lite_model.cc")
-    list(APPEND AOM_EXAMPLE_TARGETS tf_lite_model)
-    list(APPEND AOM_APP_TARGETS tf_lite_model)
-    add_dependencies(tf_lite_model tensorflowlite_a)
-    target_link_libraries(
-      tf_lite_model
-      PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libtensorflow-lite.a"
-      PRIVATE Threads::Threads
-      PRIVATE ${CMAKE_DL_LIBS})
+  set(tflite_supported 0)
+  is_tflite_supported(tflite_supported)
+  if(NOT ${tflite_supported})
+    set(CONFIG_TENSORFLOW_LITE 0)
+  else()
+    setup_tensorflow_lite()
+    if(ENABLE_EXAMPLES)
+      add_executable(tf_lite_model "${AOM_ROOT}/examples/tf_lite_model.cc")
+      list(APPEND AOM_EXAMPLE_TARGETS tf_lite_model)
+      list(APPEND AOM_APP_TARGETS tf_lite_model)
+      target_link_tf_lite_libraries(tf_lite_model)
+      add_dependencies(tf_lite_model tensorflow_lite)
+    endif()
   endif()
 endif()
 
diff --git a/README.md b/README.md
index c0fa329..2dbf5db 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@
 
 ### Prerequisites {#prerequisites}
 
- 1. [CMake](https://cmake.org) version 3.5 or higher.
+ 1. [CMake](https://cmake.org) version 3.16 or higher.
  2. [Git](https://git-scm.com/).
  3. [Perl](https://www.perl.org/).
  4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a
@@ -217,7 +217,7 @@
 ### Microsoft Visual Studio builds {#microsoft-visual-studio-builds}
 
 Building the AV1 codec library in Microsoft Visual Studio is supported. Visual
-Studio 2017 (15.0) or later is required. The following example demonstrates
+Studio 2019 (16.7) or later is required. The following example demonstrates
 generating projects and a solution for the Microsoft IDE:
 
 ~~~
@@ -231,14 +231,6 @@
     # To build with Visual Studio 2019 for the 32-bit x86 target:
     $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32
     $ cmake --build .
-
-    # To build with Visual Studio 2017 for the x64 target:
-    $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 -A x64
-    $ cmake --build .
-
-    # To build with Visual Studio 2017 for the 32-bit x86 target:
-    $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64
-    $ cmake --build .
 ~~~
 
 NOTE: The build system targets Windows 7 or later by compiling files with
diff --git a/build/cmake/tensorflow_lite.cmake b/build/cmake/tensorflow_lite.cmake
index 9fca969..5db1aee 100644
--- a/build/cmake/tensorflow_lite.cmake
+++ b/build/cmake/tensorflow_lite.cmake
@@ -13,31 +13,22 @@
 endif() # AOM_BUILD_CMAKE_TENSORFLOW_LITE_CMAKE_
 set(AOM_BUILD_CMAKE_TENSORFLOW_LITE_CMAKE_ 1)
 
+include(ExternalProject)
 include(FindGit)
 
-# Checks if the dependencies on Tensorflow Lite are already checked out -- if
-# not, uses the git submodule command to fetch them.
-function(checkout_submodules_)
+# Checks if Tensorflow has been checked out -- if not, uses the git submodule
+# command to fetch it.
+function(checkout_submodule_)
   # As a quick sanity check, see if at least 1 expected file or directory is
   # present in each submodule. If so, assume they are all checked out (if they
   # are not, then the base directory will be empty).
-  if(
-    (EXISTS "${AOM_ROOT}/third_party/tensorflow/tensorflow")
-    AND (EXISTS
-         "${AOM_ROOT}/third_party/tensorflow_dependencies/neon_2_sse/ReadMe.md")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/absl/absl")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/eigen/Eigen")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/farmhash/Makefile.am")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/flatbuffers/BUILD")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/fp16/CMakeLists.txt")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/gemmlowp/BUILD")
-    AND (EXISTS "${AOM_ROOT}/third_party/tensorflow_aom/ruy/BUILD"))
+  if(EXISTS "${AOM_ROOT}/third_party/tensorflow/tensorflow")
     return()
   endif()
   if(NOT GIT_FOUND)
     message(
       FATAL_ERROR
-        "Tensorflow-Lite/dependencies not present; " "git could not be found; "
+        "Tensorflow-Lite not present; " "git could not be found; "
         "please check out submodules with 'git submodule update --init'")
   endif()
   # Note that "git submodule update --init" must be run from inside the git
@@ -55,43 +46,183 @@
   endif()
 endfunction()
 
-function(add_tensorflow_lite_dependency_)
-  if(NOT AOM_APP_TARGETS)
-    message(FATAL_ERROR "AOM_APP_TARGETS variable must not be empty.")
+# Add the TF-lite link-related library to the named target (e.g., an executable
+# or library). This function handles the different naming conventions of
+# operating systems.
+function(target_link_tf_lite_dep_ named_target subdir libname)
+  if(NOT (("${subdir}" STREQUAL "") OR ("${subdir}" MATCHES "/$")))
+    message(
+      FATAL_ERROR "sub-directory must be empty or end with a slash: ${subdir}")
   endif()
-  # Build the library.
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/libtensorflow-lite.a"
-    COMMAND "${AOM_ROOT}/third_party/tensorflow_dependencies/build.pl"
-            "${AOM_ROOT}" "${CMAKE_CURRENT_BINARY_DIR}/libtensorflow-lite.a"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
-  add_custom_target(tensorflowlite_a ALL
-                    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libtensorflow-lite.a")
-  include_directories("${AOM_ROOT}/third_party/tensorflow")
-  include_directories(
-    "${AOM_ROOT}/third_party/tensorflow_dependencies/flatbuffers/include/")
-  # Add tensorflow-lite as a dependency on all AOM applications.
-  foreach(aom_app ${AOM_APP_TARGETS})
-    add_dependencies(${aom_app} tensorflowlite_a)
-    target_link_libraries(
-      ${aom_app}
-      PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libtensorflow-lite.a"
-              ${AOM_LIB_LINK_TYPE} Threads::Threads
-      PRIVATE ${CMAKE_DL_LIBS})
-  endforeach()
+
+  set(STATIC_LIBRARY_DIR "")
+  if(MSVC)
+    set(STATIC_LIBRARY_DIR "$<CONFIG>/")
+  endif()
+  target_link_libraries(
+    ${named_target}
+    PRIVATE
+      "${CMAKE_BINARY_DIR}/tensorflow_lite/${subdir}${STATIC_LIBRARY_DIR}${CMAKE_STATIC_LIBRARY_PREFIX}${libname}${CMAKE_STATIC_LIBRARY_SUFFIX}"
+    )
 endfunction()
 
-# If Tensorflow-Lite should be enabled, adds appropriate build rules / targets.
-function(setup_tensorflow_lite)
-  if("${AOM_ROOT}" STREQUAL "")
-    message(FATAL_ERROR "AOM_ROOT variable must not be empty.")
-  endif()
+# Add TF-lite libraries onto the named target at link time (e.g., an executable
+# or library). For enabling TF-lite for experiments, prefer the
+# "experiment_requires_tf_lite" function.
+function(target_link_tf_lite_libraries named_target)
+  target_link_libraries(${named_target} ${AOM_LIB_LINK_TYPE} Threads::Threads)
+  target_link_libraries(${named_target} PRIVATE ${CMAKE_DL_LIBS})
+  target_link_tf_lite_dep_(${named_target} "" tensorflow-lite)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags_internal)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags_registry)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags_config)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags_program_name)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/flags/
+                           absl_flags_marshalling)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/hash/
+                           absl_hash)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/hash/
+                           absl_city)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/status/
+                           absl_status)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/types/
+                           absl_bad_optional_access)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/strings/
+                           absl_cord)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/strings/
+                           absl_str_format_internal)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/synchronization/
+                           absl_synchronization)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/debugging/
+                           absl_stacktrace)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/debugging/
+                           absl_symbolize)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/debugging/
+                           absl_debugging_internal)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/debugging/
+                           absl_demangle_internal)
+  target_link_tf_lite_dep_(${named_target}
+                           _deps/abseil-cpp-build/absl/synchronization/
+                           absl_graphcycles_internal)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_malloc_internal)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/time/
+                           absl_time)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/strings/
+                           absl_strings)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/strings/
+                           absl_strings_internal)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_throw_delegate)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_base)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_dynamic_annotations)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_spinlock_wait)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/numeric/
+                           absl_int128)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/time/
+                           absl_civil_time)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/time/
+                           absl_time_zone)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/types/
+                           absl_bad_variant_access)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_raw_logging_internal)
+  target_link_tf_lite_dep_(${named_target} _deps/abseil-cpp-build/absl/base/
+                           absl_log_severity)
+  target_link_tf_lite_dep_(${named_target} _deps/farmhash-build/ farmhash)
+  target_link_tf_lite_dep_(${named_target} _deps/fft2d-build/ fft2d_fftsg2d)
+  target_link_tf_lite_dep_(${named_target} _deps/fft2d-build/ fft2d_fftsg)
+  target_link_tf_lite_dep_(${named_target} _deps/flatbuffers-build/ flatbuffers)
+  target_link_tf_lite_dep_(${named_target} _deps/ruy-build/ ruy)
+endfunction()
+
+# Can Tensorflow-Lite be enabled with the current build system? Sets the
+# variable with either 0 or 1 as the value. If 0, also prints an explanatory
+# message.
+function(is_tflite_supported result)
   # Cross-compile is not currently implemented.
   if(CMAKE_TOOLCHAIN_FILE)
     message("TOOLCHAIN: ${CMAKE_TOOLCHAIN_FILE}")
     message(WARNING "No cross-compile support for TensorFlow Lite; disabling")
-    set(CONFIG_TENSORFLOW_LITE 0)
+    set(${result} 0 PARENT_SCOPE)
+    return()
   endif()
-  checkout_submodules_()
-  add_tensorflow_lite_dependency_()
+  # TF-Lite specifies a minimum CMake version of 3.16, but Jenkins uses 3.7.2.
+  # Until Jenkins is upgraded, disable TF-Lite if a lower version of CMake is
+  # detected.
+  if(${CMAKE_VERSION} VERSION_LESS "3.16")
+    message(
+      WARNING "Tensorflow Lite requres CMake version 3.16 or higher; version "
+              "${CMAKE_VERSION} detected; disabling")
+    set(${result} 0 PARENT_SCOPE)
+    return()
+  endif()
+  set(${result} 1 PARENT_SCOPE)
+endfunction()
+
+# Adds appropriate build rules / targets. Only invoke this function if
+# is_tf_lite_supported returns true.
+function(setup_tensorflow_lite)
+  if("${AOM_ROOT}" STREQUAL "")
+    message(FATAL_ERROR "AOM_ROOT variable must not be empty.")
+  endif()
+
+  if(MSVC)
+    add_compile_definitions(NOMINMAX=1)
+  endif()
+  checkout_submodule_()
+
+  # Allow code to reference TF.
+  include_directories("${AOM_ROOT}/third_party/tensorflow")
+
+  externalproject_add(
+    tensorflow_lite
+    SOURCE_DIR "${AOM_ROOT}/third_party/tensorflow/tensorflow/lite"
+    PREFIX "${CMAKE_BINARY_DIR}/tensorflow_lite"
+    BINARY_DIR "${CMAKE_BINARY_DIR}/tensorflow_lite"
+    DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/tensorflow_lite"
+    LOG_BUILD 1)
+
+  # TF-Lite depends on this, and downloads it during compilation.
+  include_directories(
+    "${CMAKE_CURRENT_BINARY_DIR}/tensorflow_lite/flatbuffers/include/")
+
+  add_dependencies(aom_av1_common tensorflow_lite)
+  foreach(aom_app ${AOM_APP_TARGETS})
+    add_dependencies(${aom_app} tensorflow_lite)
+    target_link_tf_lite_libraries(${aom_app})
+  endforeach()
+endfunction()
+
+# Signal that the experiment needs TF-lite enabled.
+function(experiment_requires_tf_lite experiment_name)
+  # Experiment is not enabled, no need to include TF-Lite in the build.
+  if(NOT ${${experiment_name}})
+    return()
+  endif()
+  set(supported 0)
+  is_tflite_supported(supported)
+  if(NOT ${supported})
+    # Disable the experiment so Gerrit will not test this case.
+    message(WARNING "Disabling ${experiment_name}.")
+    set(${experiment_name} 0 PARENT_SCOPE)
+    set(CONFIG_TENSORFLOW_LITE 0 PARENT_SCOPE)
+    return()
+  endif()
+  # Otherwise, enable TF-lite.
+  set(CONFIG_TENSORFLOW_LITE 1 PARENT_SCOPE)
 endfunction()
diff --git a/third_party/tensorflow b/third_party/tensorflow
index 18445b0..ce0d0a4 160000
--- a/third_party/tensorflow
+++ b/third_party/tensorflow
@@ -1 +1 @@
-Subproject commit 18445b0e39b677a21c86b4cf3d2bcb823f27e3e2
+Subproject commit ce0d0a458c87c8c3c541d56bfbea85455e4c7003
diff --git a/third_party/tensorflow_dependencies/absl b/third_party/tensorflow_dependencies/absl
deleted file mode 160000
index 3c2bed2..0000000
--- a/third_party/tensorflow_dependencies/absl
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 3c2bed2e77a8b77ce3092e3f75140dd21879bdba
diff --git a/third_party/tensorflow_dependencies/build.pl b/third_party/tensorflow_dependencies/build.pl
deleted file mode 100755
index dcf14e5..0000000
--- a/third_party/tensorflow_dependencies/build.pl
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env perl
-#
-# Copyright (c) 2020, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and
-# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-# was not distributed with this source code in the LICENSE file, you can
-# obtain it at www.aomedia.org/license/software. If the Alliance for Open
-# Media Patent License 1.0 was not distributed with this source code in the
-# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#
-###########################################################################
-#
-# Script to build the static TensorFlow lite library.
-#
-# Tensorflow's build process generates the static libraries in the same
-# directory as the source code. AOM, however, generates binaries/objects/etc.
-# in a different directory. This script:
-#
-# 1.) Copies the TensorFlow code to a temporary directory
-# 2.) Copies the necessary dependencies into the temporary directory
-# 3.) Compiles it
-# 4.) Copies the static library to the AOM build directory
-#
-# Note that we do not use the download_dependencies.sh script directly, as
-# it downloads directly into the source directory.
-
-use strict;
-use warnings;
-use autodie;
-use Cwd;
-use File::Basename;
-use File::Copy;
-use File::Spec::Functions;
-use File::Temp;
-
-# Apply a tweak to Eigen (TF Lite's "download_dependencies.sh"
-# performs 3 tweaks, but only 1 of them is relevant).
-sub apply_eigen_tweak {
-    my $downloads_dir = $_[0];
-    my $file = catfile($downloads_dir, "eigen", "Eigen", "src", "Core",
-                       "arch", "NEON", "Complex.h");
-    open(my $fh, '<', $file);
-    my @lines = <$fh>;
-    close($fh);
-    open($fh, '>', $file);
-    foreach my $line (@lines) {
-        $line =~ s/static uint64x2_t p2ul_CONJ_XOR = vld1q_u64\( p2ul_conj_XOR_DATA \);/static uint64x2_t p2ul_CONJ_XOR;/;
-        print $fh $line;
-    }
-    close($fh);
-}
-
-# Pure Perl implementation of recursive directory-copy -
-# File::Copy::Recursive is not part of core.
-sub recursive_dircopy {
-    my $input_dir = $_[0];
-    my $output_dir = $_[1];
-    # If the output directory does not exist, create it.
-    if (!(-d $output_dir)) {
-        mkdir $output_dir;
-    }
-    opendir(my $dh, $input_dir);
-    while (my $file = readdir($dh)) {
-        # Ignore . and .. entries.
-        next if ($file eq "." or $file eq "..");
-        my $in_file = catfile($input_dir, $file);
-        my $out_file = catfile($output_dir, $file);
-        # If it is a directory, recursively call.
-        if (-d $in_file) {
-            recursive_dircopy($in_file, $out_file);
-        } else {
-          # If a file, copy over.
-          copy($in_file, $out_file);
-          # Preserve execute permission.
-          if (-x $in_file) {
-            chmod 0755, $out_file;
-          }
-        }
-    }
-    closedir($dh);
-}
-
-# Finds the first instance of the file in the directory, using a depth-first
-# search. Returns "" if unable to find.
-sub find_file {
-  my $source_dir = $_[0];
-  my $fname = $_[1];
-  opendir(my $dh, $source_dir);
-  while (my $file = readdir($dh)) {
-    # Ignore . and .. entries.
-    next if ($file eq "." or $file eq "..");
-    if (-d catfile($source_dir, $file)) {
-      my $result = find_file(catfile($source_dir, $file), $fname);
-      if ($result ne "") {
-        closedir($dh);
-        return $result;
-      }
-    } elsif ($file eq $fname) {
-      closedir($dh);
-      return catfile($source_dir, $file);
-    }
-  }
-  closedir($dh);
-  return "";
-}
-
-sub copy_tensorflow_lite_dependencies {
-    my $source_dir = $_[0];
-    my $output_dir = $_[1];
-    my $dependencies_dir = catfile($source_dir, "third_party",
-                                   "tensorflow_dependencies");
-    opendir(my $dh, $dependencies_dir);
-    while (my $file = readdir($dh)) {
-        # Ignore . and .. entries.
-        next if ($file eq "." or $file eq "..");
-        # Ignore non-directories.
-        next unless (-d catfile($dependencies_dir, $file));
-        # Copy the directory.
-        print "  * $file\n";
-        recursive_dircopy(
-          catfile($dependencies_dir, $file),
-          catfile($output_dir, $file));
-    }
-    closedir($dh);
-}
-
-# Start of program logic.
-if ($#ARGV + 1 != 2) {
-    my $prog = basename($0);
-    warn("Usage: $prog <source directory> <output directory>\n");
-    exit(1);
-}
-my ($source_dir, $output_dir) = @ARGV;
-
-my $temp_dir = File::Temp::tempdir( CLEANUP => 1 );
-
-print "Copying TensorFlow code to temporary directory for building...\n";
-my $tf_dir = catfile($source_dir, "third_party", "tensorflow");
-recursive_dircopy($tf_dir, $temp_dir);
-
-my $downloads_dir = catfile($temp_dir, "tensorflow", "lite", "tools",
-                            "make", "downloads");
-mkdir $downloads_dir;
-print "Copying TensorFlow Lite dependencies to temporary directory...\n";
-copy_tensorflow_lite_dependencies($source_dir, $downloads_dir);
-apply_eigen_tweak($downloads_dir);
-
-print "Building TensorFlow Lite...\n";
-my $build_sh = catfile($temp_dir, "tensorflow", "lite", "tools", "make",
-                       "build_lib.sh");
-`$build_sh`;
-# Find the libtensorflow-lite.a file, which is generated under
-# tensorflow/lite/tools/make/gen (but in a different sub-directory depending
-# on the architecture).
-my $gen_dir = catfile($temp_dir, "tensorflow", "lite", "tools", "make", "gen");
-my $liblite = find_file($gen_dir, "libtensorflow-lite.a");
-$liblite ne "" or die("Unable to find libtensorflow-lite.a");
-print "Copying static library into build directory...\n";
-copy($liblite, $output_dir);
diff --git a/third_party/tensorflow_dependencies/eigen b/third_party/tensorflow_dependencies/eigen
deleted file mode 160000
index b11f817..0000000
--- a/third_party/tensorflow_dependencies/eigen
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b11f817bcff04276f3024d6780f56a137968b81a
diff --git a/third_party/tensorflow_dependencies/farmhash b/third_party/tensorflow_dependencies/farmhash
deleted file mode 160000
index 0d859a8..0000000
--- a/third_party/tensorflow_dependencies/farmhash
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0d859a811870d10f53a594927d0d0b97573ad06d
diff --git a/third_party/tensorflow_dependencies/fft2d/alloc.c b/third_party/tensorflow_dependencies/fft2d/alloc.c
deleted file mode 100644
index 7833d88..0000000
--- a/third_party/tensorflow_dependencies/fft2d/alloc.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/* ---- memory allocation ---- */
-#include "alloc.h"
-
-
-#define alloc_error_check(p) { \
-    if ((p) == NULL) { \
-        fprintf(stderr, "Allocation Failure!\n"); \
-        exit(1); \
-    } \
-}
-
-
-int *alloc_1d_int(int n1)
-{
-    int *i;
-    
-    i = (int *) malloc(sizeof(int) * n1);
-    alloc_error_check(i);
-    return i;
-}
-
-
-void free_1d_int(int *i)
-{
-    free(i);
-}
-
-
-double *alloc_1d_double(int n1)
-{
-    double *d;
-    
-    d = (double *) malloc(sizeof(double) * n1);
-    alloc_error_check(d);
-    return d;
-}
-
-
-void free_1d_double(double *d)
-{
-    free(d);
-}
-
-
-int **alloc_2d_int(int n1, int n2)
-{
-    int **ii, *i;
-    int j;
-    
-    ii = (int **) malloc(sizeof(int *) * n1);
-    alloc_error_check(ii);
-    i = (int *) malloc(sizeof(int) * n1 * n2);
-    alloc_error_check(i);
-    ii[0] = i;
-    for (j = 1; j < n1; j++) {
-        ii[j] = ii[j - 1] + n2;
-    }
-    return ii;
-}
-
-
-void free_2d_int(int **ii)
-{
-    free(ii[0]);
-    free(ii);
-}
-
-
-double **alloc_2d_double(int n1, int n2)
-{
-    double **dd, *d;
-    int j;
-    
-    dd = (double **) malloc(sizeof(double *) * n1);
-    alloc_error_check(dd);
-    d = (double *) malloc(sizeof(double) * n1 * n2);
-    alloc_error_check(d);
-    dd[0] = d;
-    for (j = 1; j < n1; j++) {
-        dd[j] = dd[j - 1] + n2;
-    }
-    return dd;
-}
-
-
-void free_2d_double(double **dd)
-{
-    free(dd[0]);
-    free(dd);
-}
-
-
-int ***alloc_3d_int(int n1, int n2, int n3)
-{
-    int ***iii, **ii, *i;
-    int j;
-    
-    iii = (int ***) malloc(sizeof(int **) * n1);
-    alloc_error_check(iii);
-    ii = (int **) malloc(sizeof(int *) * n1 * n2);
-    alloc_error_check(ii);
-    iii[0] = ii;
-    for (j = 1; j < n1; j++) {
-        iii[j] = iii[j - 1] + n2;
-    }
-    i = (int *) malloc(sizeof(int) * n1 * n2 * n3);
-    alloc_error_check(i);
-    ii[0] = i;
-    for (j = 1; j < n1 * n2; j++) {
-        ii[j] = ii[j - 1] + n3;
-    }
-    return iii;
-}
-
-
-void free_3d_int(int ***iii)
-{
-    free(iii[0][0]);
-    free(iii[0]);
-    free(iii);
-}
-
-
-double ***alloc_3d_double(int n1, int n2, int n3)
-{
-    double ***ddd, **dd, *d;
-    int j;
-    
-    ddd = (double ***) malloc(sizeof(double **) * n1);
-    alloc_error_check(ddd);
-    dd = (double **) malloc(sizeof(double *) * n1 * n2);
-    alloc_error_check(dd);
-    ddd[0] = dd;
-    for (j = 1; j < n1; j++) {
-        ddd[j] = ddd[j - 1] + n2;
-    }
-    d = (double *) malloc(sizeof(double) * n1 * n2 * n3);
-    alloc_error_check(d);
-    dd[0] = d;
-    for (j = 1; j < n1 * n2; j++) {
-        dd[j] = dd[j - 1] + n3;
-    }
-    return ddd;
-}
-
-
-void free_3d_double(double ***ddd)
-{
-    free(ddd[0][0]);
-    free(ddd[0]);
-    free(ddd);
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/alloc.h b/third_party/tensorflow_dependencies/fft2d/alloc.h
deleted file mode 100644
index 3467cc4..0000000
--- a/third_party/tensorflow_dependencies/fft2d/alloc.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* ---- memory allocation ---- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-
-
-int *alloc_1d_int(int n1);
-void free_1d_int(int *i);
-double *alloc_1d_double(int n1);
-void free_1d_double(double *d);
-int **alloc_2d_int(int n1, int n2);
-void free_2d_int(int **ii);
-double **alloc_2d_double(int n1, int n2);
-void free_2d_double(double **dd);
-int ***alloc_3d_int(int n1, int n2, int n3);
-void free_3d_int(int ***iii);
-double ***alloc_3d_double(int n1, int n2, int n3);
-void free_3d_double(double ***ddd);
-
diff --git a/third_party/tensorflow_dependencies/fft2d/fft4f2d.c b/third_party/tensorflow_dependencies/fft2d/fft4f2d.c
deleted file mode 100644
index 354c80e..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fft4f2d.c
+++ /dev/null
@@ -1,1705 +0,0 @@
-/*
-Fast Fourier/Cosine/Sine Transform
-    dimension   :two
-    data length :power of 2
-    decimation  :frequency
-    radix       :4, 2, row-column
-    data        :inplace
-    table       :use
-functions
-    cdft2d: Complex Discrete Fourier Transform
-    rdft2d: Real Discrete Fourier Transform
-    ddct2d: Discrete Cosine Transform
-    ddst2d: Discrete Sine Transform
-function prototypes
-    void cdft2d(int, int, int, double **, int *, double *);
-    void rdft2d(int, int, int, double **, int *, double *);
-    void ddct2d(int, int, int, double **, double **, int *, double *);
-    void ddst2d(int, int, int, double **, double **, int *, double *);
-
-
--------- Complex DFT (Discrete Fourier Transform) --------
-    [definition]
-        <case1>
-            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
-                            exp(2*pi*i*j1*k1/n1) * 
-                            exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
-        <case2>
-            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
-                            exp(-2*pi*i*j1*k1/n1) * 
-                            exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
-        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            cdft2d(n1, 2*n2, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            cdft2d(n1, 2*n2, -1, a, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 1, n1 = power of 2
-        2*n2   :data length (int)
-                n2 >= 1, n2 = power of 2
-        a[0...n1-1][0...2*n2-1]
-               :input/output data (double **)
-                input data
-                    a[j1][2*j2] = Re(x[j1][j2]), 
-                    a[j1][2*j2+1] = Im(x[j1][j2]), 
-                    0<=j1<n1, 0<=j2<n2
-                output data
-                    a[k1][2*k2] = Re(X[k1][k2]), 
-                    a[k1][2*k2+1] = Im(X[k1][k2]), 
-                    0<=k1<n1, 0<=k2<n2
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            cdft2d(n1, 2*n2, -1, a, ip, w);
-        is 
-            cdft2d(n1, 2*n2, 1, a, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
-                    a[j1][j2] *= 1.0 / (n1 * n2);
-                }
-            }
-        .
-
-
--------- Real DFT / Inverse of Real DFT --------
-    [definition]
-        <case1> RDFT
-            R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-            I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> IRDFT (excluding scale)
-            a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
-                            (R[j1][j2] * 
-                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
-                            I[j1][j2] * 
-                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
-                            0<=k1<n1, 0<=k2<n2
-        (notes: R[n1-k1][n2-k2] = R[k1][k2], 
-                I[n1-k1][n2-k2] = -I[k1][k2], 
-                R[n1-k1][0] = R[k1][0], 
-                I[n1-k1][0] = -I[k1][0], 
-                R[0][n2-k2] = R[0][k2], 
-                I[0][n2-k2] = -I[0][k2], 
-                0<k1<n1, 0<k2<n2)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            rdft2d(n1, n2, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            rdft2d(n1, n2, -1, a, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                <case1>
-                    output data
-                        a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], 
-                        a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], 
-                            0<k1<n1, 0<k2<n2/2, 
-                        a[0][2*k2] = R[0][k2] = R[0][n2-k2], 
-                        a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2], 
-                            0<k2<n2/2, 
-                        a[k1][0] = R[k1][0] = R[n1-k1][0], 
-                        a[k1][1] = I[k1][0] = -I[n1-k1][0], 
-                        a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2], 
-                        a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2], 
-                            0<k1<n1/2, 
-                        a[0][0] = R[0][0], 
-                        a[0][1] = R[0][n2/2], 
-                        a[n1/2][0] = R[n1/2][0], 
-                        a[n1/2][1] = R[n1/2][n2/2]
-                <case2>
-                    input data
-                        a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], 
-                        a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], 
-                            0<j1<n1, 0<j2<n2/2, 
-                        a[0][2*j2] = R[0][j2] = R[0][n2-j2], 
-                        a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2], 
-                            0<j2<n2/2, 
-                        a[j1][0] = R[j1][0] = R[n1-j1][0], 
-                        a[j1][1] = I[j1][0] = -I[n1-j1][0], 
-                        a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2], 
-                        a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2], 
-                            0<j1<n1/2, 
-                        a[0][0] = R[0][0], 
-                        a[0][1] = R[0][n2/2], 
-                        a[n1/2][0] = R[n1/2][0], 
-                        a[n1/2][1] = R[n1/2][n2/2]
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/4) + n2/4
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            rdft2d(n1, n2, 1, a, ip, w);
-        is 
-            rdft2d(n1, n2, -1, a, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 2.0 / (n1 * n2);
-                }
-            }
-        .
-
-
--------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> IDCT (excluding scale)
-            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(pi*j1*(k1+1/2)/n1) * 
-                            cos(pi*j2*(k2+1/2)/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> DCT
-            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(pi*(j1+1/2)*k1/n1) * 
-                            cos(pi*(j2+1/2)*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddct2d(n1, n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddct2d(n1, n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                output data
-                    a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
-        t[0...n1-1][0...n2-1]
-               :work area (double **)
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/4) + max(n1, n2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddct2d(n1, n2, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                a[j1][0] *= 0.5;
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                a[0][j2] *= 0.5;
-            }
-            ddct2d(n1, n2, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 4.0 / (n1 * n2);
-                }
-            }
-        .
-
-
--------- DST (Discrete Sine Transform) / Inverse of DST --------
-    [definition]
-        <case1> IDST (excluding scale)
-            S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * 
-                            sin(pi*j1*(k1+1/2)/n1) * 
-                            sin(pi*j2*(k2+1/2)/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> DST
-            S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            sin(pi*(j1+1/2)*k1/n1) * 
-                            sin(pi*(j2+1/2)*k2/n2), 
-                            0<k1<=n1, 0<k2<=n2
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddst2d(n1, n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddst2d(n1, n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                <case1>
-                    input data
-                        a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2, 
-                        a[j1][0] = A[j1][n2], 0<j1<n1, 
-                        a[0][j2] = A[n1][j2], 0<j2<n2, 
-                        a[0][0] = A[n1][n2]
-                        (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
-                    output data
-                        a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
-                <case2>
-                    output data
-                        a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2, 
-                        a[k1][0] = S[k1][n2], 0<k1<n1, 
-                        a[0][k2] = S[n1][k2], 0<k2<n2, 
-                        a[0][0] = S[n1][n2]
-                        (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
-        t[0...n1-1][0...n2-1]
-               :work area (double **)
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/4) + max(n1, n2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddst2d(n1, n2, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                a[j1][0] *= 0.5;
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                a[0][j2] *= 0.5;
-            }
-            ddst2d(n1, n2, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 4.0 / (n1 * n2);
-                }
-            }
-        .
-*/
-
-
-void cdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void bitrv2col(int n1, int n, int *ip, double **a);
-    void bitrv2row(int n, int n2, int *ip, double **a);
-    void cftbcol(int n1, int n, double **a, double *w);
-    void cftbrow(int n, int n2, double **a, double *w);
-    void cftfcol(int n1, int n, double **a, double *w);
-    void cftfrow(int n, int n2, double **a, double *w);
-    int n;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    if (n > (ip[0] << 2)) {
-        makewt(n >> 2, ip, w);
-    }
-    if (n2 > 4) {
-        bitrv2col(n1, n2, ip + 2, a);
-    }
-    if (n1 > 2) {
-        bitrv2row(n1, n2, ip + 2, a);
-    }
-    if (isgn < 0) {
-        cftfcol(n1, n2, a, w);
-        cftfrow(n1, n2, a, w);
-    } else {
-        cftbcol(n1, n2, a, w);
-        cftbrow(n1, n2, a, w);
-    }
-}
-
-
-void rdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void bitrv2col(int n1, int n, int *ip, double **a);
-    void bitrv2row(int n, int n2, int *ip, double **a);
-    void cftbcol(int n1, int n, double **a, double *w);
-    void cftbrow(int n, int n2, double **a, double *w);
-    void cftfcol(int n1, int n, double **a, double *w);
-    void cftfrow(int n, int n2, double **a, double *w);
-    void rftbcol(int n1, int n, double **a, int nc, double *c);
-    void rftfcol(int n1, int n, double **a, int nc, double *c);
-    int n, nw, nc, n1h, i, j;
-    double xi;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n2 > (nc << 2)) {
-        nc = n2 >> 2;
-        makect(nc, ip, w + nw);
-    }
-    n1h = n1 >> 1;
-    if (isgn < 0) {
-        for (i = 1; i <= n1h - 1; i++) {
-            j = n1 - i;
-            xi = a[i][0] - a[j][0];
-            a[i][0] += a[j][0];
-            a[j][0] = xi;
-            xi = a[j][1] - a[i][1];
-            a[i][1] += a[j][1];
-            a[j][1] = xi;
-        }
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, a);
-        }
-        cftfrow(n1, n2, a, w);
-        for (i = 0; i <= n1 - 1; i++) {
-            a[i][1] = 0.5 * (a[i][0] - a[i][1]);
-            a[i][0] -= a[i][1];
-        }
-        if (n2 > 4) {
-            rftfcol(n1, n2, a, nc, w + nw);
-            bitrv2col(n1, n2, ip + 2, a);
-        }
-        cftfcol(n1, n2, a, w);
-    } else {
-        if (n2 > 4) {
-            bitrv2col(n1, n2, ip + 2, a);
-        }
-        cftbcol(n1, n2, a, w);
-        if (n2 > 4) {
-            rftbcol(n1, n2, a, nc, w + nw);
-        }
-        for (i = 0; i <= n1 - 1; i++) {
-            xi = a[i][0] - a[i][1];
-            a[i][0] += a[i][1];
-            a[i][1] = xi;
-        }
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, a);
-        }
-        cftbrow(n1, n2, a, w);
-        for (i = 1; i <= n1h - 1; i++) {
-            j = n1 - i;
-            a[j][0] = 0.5 * (a[i][0] - a[j][0]);
-            a[i][0] -= a[j][0];
-            a[j][1] = 0.5 * (a[i][1] + a[j][1]);
-            a[i][1] -= a[j][1];
-        }
-    }
-}
-
-
-void ddct2d(int n1, int n2, int isgn, double **a, double **t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void bitrv2col(int n1, int n, int *ip, double **a);
-    void bitrv2row(int n, int n2, int *ip, double **a);
-    void cftbcol(int n1, int n, double **a, double *w);
-    void cftbrow(int n, int n2, double **a, double *w);
-    void cftfcol(int n1, int n, double **a, double *w);
-    void cftfrow(int n, int n2, double **a, double *w);
-    void rftbcol(int n1, int n, double **a, int nc, double *c);
-    void rftfcol(int n1, int n, double **a, int nc, double *c);
-    void dctbsub(int n1, int n2, double **a, int nc, double *c);
-    void dctfsub(int n1, int n2, double **a, int nc, double *c);
-    int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
-    double xi;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n1 > nc || n2 > nc) {
-        if (n1 > n2) {
-            nc = n1;
-        } else {
-            nc = n2;
-        }
-        makect(nc, ip, w + nw);
-    }
-    n1h = n1 >> 1;
-    n2h = n2 >> 1;
-    if (isgn >= 0) {
-        for (i = 0; i <= n1 - 1; i++) {
-            for (j = 1; j <= n2h - 1; j++) {
-                jx = j << 1;
-                t[i][jx] = a[i][j];
-                t[i][jx + 1] = a[i][n2 - j];
-            }
-        }
-        t[0][0] = a[0][0];
-        t[0][1] = a[0][n2h];
-        t[n1h][0] = a[n1h][0];
-        t[n1h][1] = a[n1h][n2h];
-        for (i = 1; i <= n1h - 1; i++) {
-            ic = n1 - i;
-            t[i][0] = a[i][0];
-            t[ic][1] = a[i][n2h];
-            t[i][1] = a[ic][0];
-            t[ic][0] = a[ic][n2h];
-        }
-        dctfsub(n1, n2, t, nc, w + nw);
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, t);
-        }
-        cftfrow(n1, n2, t, w);
-        for (i = 0; i <= n1 - 1; i++) {
-            t[i][1] = 0.5 * (t[i][0] - t[i][1]);
-            t[i][0] -= t[i][1];
-        }
-        if (n2 > 4) {
-            rftfcol(n1, n2, t, nc, w + nw);
-            bitrv2col(n1, n2, ip + 2, t);
-        }
-        cftfcol(n1, n2, t, w);
-        for (i = 0; i <= n1h - 1; i++) {
-            ix = i << 1;
-            ic = n1 - 1 - i;
-            for (j = 0; j <= n2h - 1; j++) {
-                jx = j << 1;
-                jc = n2 - 1 - j;
-                a[ix][jx] = t[i][j];
-                a[ix][jx + 1] = t[i][jc];
-                a[ix + 1][jx] = t[ic][j];
-                a[ix + 1][jx + 1] = t[ic][jc];
-            }
-        }
-    } else {
-        for (i = 0; i <= n1h - 1; i++) {
-            ix = i << 1;
-            ic = n1 - 1 - i;
-            for (j = 0; j <= n2h - 1; j++) {
-                jx = j << 1;
-                jc = n2 - 1 - j;
-                t[i][j] = a[ix][jx];
-                t[i][jc] = a[ix][jx + 1];
-                t[ic][j] = a[ix + 1][jx];
-                t[ic][jc] = a[ix + 1][jx + 1];
-            }
-        }
-        if (n2 > 4) {
-            bitrv2col(n1, n2, ip + 2, t);
-        }
-        cftbcol(n1, n2, t, w);
-        if (n2 > 4) {
-            rftbcol(n1, n2, t, nc, w + nw);
-        }
-        for (i = 0; i <= n1 - 1; i++) {
-            xi = t[i][0] - t[i][1];
-            t[i][0] += t[i][1];
-            t[i][1] = xi;
-        }
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, t);
-        }
-        cftbrow(n1, n2, t, w);
-        dctbsub(n1, n2, t, nc, w + nw);
-        for (i = 0; i <= n1 - 1; i++) {
-            for (j = 1; j <= n2h - 1; j++) {
-                jx = j << 1;
-                a[i][j] = t[i][jx];
-                a[i][n2 - j] = t[i][jx + 1];
-            }
-        }
-        a[0][0] = t[0][0];
-        a[0][n2h] = t[0][1];
-        a[n1h][0] = t[n1h][0];
-        a[n1h][n2h] = t[n1h][1];
-        for (i = 1; i <= n1h - 1; i++) {
-            ic = n1 - i;
-            a[i][0] = t[i][0];
-            a[i][n2h] = t[ic][1];
-            a[ic][0] = t[i][1];
-            a[ic][n2h] = t[ic][0];
-        }
-    }
-}
-
-
-void ddst2d(int n1, int n2, int isgn, double **a, double **t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void bitrv2col(int n1, int n, int *ip, double **a);
-    void bitrv2row(int n, int n2, int *ip, double **a);
-    void cftbcol(int n1, int n, double **a, double *w);
-    void cftbrow(int n, int n2, double **a, double *w);
-    void cftfcol(int n1, int n, double **a, double *w);
-    void cftfrow(int n, int n2, double **a, double *w);
-    void rftbcol(int n1, int n, double **a, int nc, double *c);
-    void rftfcol(int n1, int n, double **a, int nc, double *c);
-    void dstbsub(int n1, int n2, double **a, int nc, double *c);
-    void dstfsub(int n1, int n2, double **a, int nc, double *c);
-    int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
-    double xi;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n1 > nc || n2 > nc) {
-        if (n1 > n2) {
-            nc = n1;
-        } else {
-            nc = n2;
-        }
-        makect(nc, ip, w + nw);
-    }
-    n1h = n1 >> 1;
-    n2h = n2 >> 1;
-    if (isgn >= 0) {
-        for (i = 0; i <= n1 - 1; i++) {
-            for (j = 1; j <= n2h - 1; j++) {
-                jx = j << 1;
-                t[i][jx] = a[i][j];
-                t[i][jx + 1] = a[i][n2 - j];
-            }
-        }
-        t[0][0] = a[0][0];
-        t[0][1] = a[0][n2h];
-        t[n1h][0] = a[n1h][0];
-        t[n1h][1] = a[n1h][n2h];
-        for (i = 1; i <= n1h - 1; i++) {
-            ic = n1 - i;
-            t[i][0] = a[i][0];
-            t[ic][1] = a[i][n2h];
-            t[i][1] = a[ic][0];
-            t[ic][0] = a[ic][n2h];
-        }
-        dstfsub(n1, n2, t, nc, w + nw);
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, t);
-        }
-        cftfrow(n1, n2, t, w);
-        for (i = 0; i <= n1 - 1; i++) {
-            t[i][1] = 0.5 * (t[i][0] - t[i][1]);
-            t[i][0] -= t[i][1];
-        }
-        if (n2 > 4) {
-            rftfcol(n1, n2, t, nc, w + nw);
-            bitrv2col(n1, n2, ip + 2, t);
-        }
-        cftfcol(n1, n2, t, w);
-        for (i = 0; i <= n1h - 1; i++) {
-            ix = i << 1;
-            ic = n1 - 1 - i;
-            for (j = 0; j <= n2h - 1; j++) {
-                jx = j << 1;
-                jc = n2 - 1 - j;
-                a[ix][jx] = t[i][j];
-                a[ix][jx + 1] = -t[i][jc];
-                a[ix + 1][jx] = -t[ic][j];
-                a[ix + 1][jx + 1] = t[ic][jc];
-            }
-        }
-    } else {
-        for (i = 0; i <= n1h - 1; i++) {
-            ix = i << 1;
-            ic = n1 - 1 - i;
-            for (j = 0; j <= n2h - 1; j++) {
-                jx = j << 1;
-                jc = n2 - 1 - j;
-                t[i][j] = a[ix][jx];
-                t[i][jc] = -a[ix][jx + 1];
-                t[ic][j] = -a[ix + 1][jx];
-                t[ic][jc] = a[ix + 1][jx + 1];
-            }
-        }
-        if (n2 > 4) {
-            bitrv2col(n1, n2, ip + 2, t);
-        }
-        cftbcol(n1, n2, t, w);
-        if (n2 > 4) {
-            rftbcol(n1, n2, t, nc, w + nw);
-        }
-        for (i = 0; i <= n1 - 1; i++) {
-            xi = t[i][0] - t[i][1];
-            t[i][0] += t[i][1];
-            t[i][1] = xi;
-        }
-        if (n1 > 2) {
-            bitrv2row(n1, n2, ip + 2, t);
-        }
-        cftbrow(n1, n2, t, w);
-        dstbsub(n1, n2, t, nc, w + nw);
-        for (i = 0; i <= n1 - 1; i++) {
-            for (j = 1; j <= n2h - 1; j++) {
-                jx = j << 1;
-                a[i][j] = t[i][jx];
-                a[i][n2 - j] = t[i][jx + 1];
-            }
-        }
-        a[0][0] = t[0][0];
-        a[0][n2h] = t[0][1];
-        a[n1h][0] = t[n1h][0];
-        a[n1h][n2h] = t[n1h][1];
-        for (i = 1; i <= n1h - 1; i++) {
-            ic = n1 - i;
-            a[i][0] = t[i][0];
-            a[i][n2h] = t[ic][1];
-            a[ic][0] = t[i][1];
-            a[ic][n2h] = t[ic][0];
-        }
-    }
-}
-
-
-/* -------- initializing routines -------- */
-
-
-#include <math.h>
-
-void makewt(int nw, int *ip, double *w)
-{
-    void bitrv2(int n, int *ip, double *a);
-    int nwh, j;
-    double delta, x, y;
-    
-    ip[0] = nw;
-    ip[1] = 1;
-    if (nw > 2) {
-        nwh = nw >> 1;
-        delta = atan(1.0) / nwh;
-        w[0] = 1;
-        w[1] = 0;
-        w[nwh] = cos(delta * nwh);
-        w[nwh + 1] = w[nwh];
-        for (j = 2; j <= nwh - 2; j += 2) {
-            x = cos(delta * j);
-            y = sin(delta * j);
-            w[j] = x;
-            w[j + 1] = y;
-            w[nw - j] = y;
-            w[nw - j + 1] = x;
-        }
-        bitrv2(nw, ip + 2, w);
-    }
-}
-
-
-void makect(int nc, int *ip, double *c)
-{
-    int nch, j;
-    double delta;
-    
-    ip[1] = nc;
-    if (nc > 1) {
-        nch = nc >> 1;
-        delta = atan(1.0) / nch;
-        c[0] = 0.5;
-        c[nch] = 0.5 * cos(delta * nch);
-        for (j = 1; j <= nch - 1; j++) {
-            c[j] = 0.5 * cos(delta * j);
-            c[nc - j] = 0.5 * sin(delta * j);
-        }
-    }
-}
-
-
-/* -------- child routines -------- */
-
-
-void bitrv2(int n, int *ip, double *a)
-{
-    int j, j1, k, k1, l, m, m2;
-    double xr, xi;
-    
-    ip[0] = 0;
-    l = n;
-    m = 1;
-    while ((m << 2) < l) {
-        l >>= 1;
-        for (j = 0; j <= m - 1; j++) {
-            ip[m + j] = ip[j] + l;
-        }
-        m <<= 1;
-    }
-    if ((m << 2) > l) {
-        for (k = 1; k <= m - 1; k++) {
-            for (j = 0; j <= k - 1; j++) {
-                j1 = (j << 1) + ip[k];
-                k1 = (k << 1) + ip[j];
-                xr = a[j1];
-                xi = a[j1 + 1];
-                a[j1] = a[k1];
-                a[j1 + 1] = a[k1 + 1];
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-        }
-    } else {
-        m2 = m << 1;
-        for (k = 1; k <= m - 1; k++) {
-            for (j = 0; j <= k - 1; j++) {
-                j1 = (j << 1) + ip[k];
-                k1 = (k << 1) + ip[j];
-                xr = a[j1];
-                xi = a[j1 + 1];
-                a[j1] = a[k1];
-                a[j1 + 1] = a[k1 + 1];
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += m2;
-                k1 += m2;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                a[j1] = a[k1];
-                a[j1 + 1] = a[k1 + 1];
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-        }
-    }
-}
-
-
-void bitrv2col(int n1, int n, int *ip, double **a)
-{
-    int i, j, j1, k, k1, l, m, m2;
-    double xr, xi;
-    
-    ip[0] = 0;
-    l = n;
-    m = 1;
-    while ((m << 2) < l) {
-        l >>= 1;
-        for (j = 0; j <= m - 1; j++) {
-            ip[m + j] = ip[j] + l;
-        }
-        m <<= 1;
-    }
-    if ((m << 2) > l) {
-        for (i = 0; i <= n1 - 1; i++) {
-            for (k = 1; k <= m - 1; k++) {
-                for (j = 0; j <= k - 1; j++) {
-                    j1 = (j << 1) + ip[k];
-                    k1 = (k << 1) + ip[j];
-                    xr = a[i][j1];
-                    xi = a[i][j1 + 1];
-                    a[i][j1] = a[i][k1];
-                    a[i][j1 + 1] = a[i][k1 + 1];
-                    a[i][k1] = xr;
-                    a[i][k1 + 1] = xi;
-                }
-            }
-        }
-    } else {
-        m2 = m << 1;
-        for (i = 0; i <= n1 - 1; i++) {
-            for (k = 1; k <= m - 1; k++) {
-                for (j = 0; j <= k - 1; j++) {
-                    j1 = (j << 1) + ip[k];
-                    k1 = (k << 1) + ip[j];
-                    xr = a[i][j1];
-                    xi = a[i][j1 + 1];
-                    a[i][j1] = a[i][k1];
-                    a[i][j1 + 1] = a[i][k1 + 1];
-                    a[i][k1] = xr;
-                    a[i][k1 + 1] = xi;
-                    j1 += m2;
-                    k1 += m2;
-                    xr = a[i][j1];
-                    xi = a[i][j1 + 1];
-                    a[i][j1] = a[i][k1];
-                    a[i][j1 + 1] = a[i][k1 + 1];
-                    a[i][k1] = xr;
-                    a[i][k1 + 1] = xi;
-                }
-            }
-        }
-    }
-}
-
-
-void bitrv2row(int n, int n2, int *ip, double **a)
-{
-    int i, j, j1, k, k1, l, m;
-    double xr, xi;
-    
-    ip[0] = 0;
-    l = n;
-    m = 1;
-    while ((m << 1) < l) {
-        l >>= 1;
-        for (j = 0; j <= m - 1; j++) {
-            ip[m + j] = ip[j] + l;
-        }
-        m <<= 1;
-    }
-    if ((m << 1) > l) {
-        for (k = 1; k <= m - 1; k++) {
-            for (j = 0; j <= k - 1; j++) {
-                j1 = j + ip[k];
-                k1 = k + ip[j];
-                for (i = 0; i <= n2 - 2; i += 2) {
-                    xr = a[j1][i];
-                    xi = a[j1][i + 1];
-                    a[j1][i] = a[k1][i];
-                    a[j1][i + 1] = a[k1][i + 1];
-                    a[k1][i] = xr;
-                    a[k1][i + 1] = xi;
-                }
-            }
-        }
-    } else {
-        for (k = 1; k <= m - 1; k++) {
-            for (j = 0; j <= k - 1; j++) {
-                j1 = j + ip[k];
-                k1 = k + ip[j];
-                for (i = 0; i <= n2 - 2; i += 2) {
-                    xr = a[j1][i];
-                    xi = a[j1][i + 1];
-                    a[j1][i] = a[k1][i];
-                    a[j1][i + 1] = a[k1][i + 1];
-                    a[k1][i] = xr;
-                    a[k1][i + 1] = xi;
-                }
-                j1 += m;
-                k1 += m;
-                for (i = 0; i <= n2 - 2; i += 2) {
-                    xr = a[j1][i];
-                    xi = a[j1][i + 1];
-                    a[j1][i] = a[k1][i];
-                    a[j1][i + 1] = a[k1][i + 1];
-                    a[k1][i] = xr;
-                    a[k1][i + 1] = xi;
-                }
-            }
-        }
-    }
-}
-
-
-void cftbcol(int n1, int n, double **a, double *w)
-{
-    int i, j, j1, j2, j3, k, k1, ks, l, m;
-    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    for (i = 0; i <= n1 - 1; i++) {
-        l = 2;
-        while ((l << 1) < n) {
-            m = l << 2;
-            for (j = 0; j <= l - 2; j += 2) {
-                j1 = j + l;
-                j2 = j1 + l;
-                j3 = j2 + l;
-                x0r = a[i][j] + a[i][j1];
-                x0i = a[i][j + 1] + a[i][j1 + 1];
-                x1r = a[i][j] - a[i][j1];
-                x1i = a[i][j + 1] - a[i][j1 + 1];
-                x2r = a[i][j2] + a[i][j3];
-                x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                x3r = a[i][j2] - a[i][j3];
-                x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                a[i][j] = x0r + x2r;
-                a[i][j + 1] = x0i + x2i;
-                a[i][j2] = x0r - x2r;
-                a[i][j2 + 1] = x0i - x2i;
-                a[i][j1] = x1r - x3i;
-                a[i][j1 + 1] = x1i + x3r;
-                a[i][j3] = x1r + x3i;
-                a[i][j3 + 1] = x1i - x3r;
-            }
-            if (m < n) {
-                wk1r = w[2];
-                for (j = m; j <= l + m - 2; j += 2) {
-                    j1 = j + l;
-                    j2 = j1 + l;
-                    j3 = j2 + l;
-                    x0r = a[i][j] + a[i][j1];
-                    x0i = a[i][j + 1] + a[i][j1 + 1];
-                    x1r = a[i][j] - a[i][j1];
-                    x1i = a[i][j + 1] - a[i][j1 + 1];
-                    x2r = a[i][j2] + a[i][j3];
-                    x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                    x3r = a[i][j2] - a[i][j3];
-                    x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                    a[i][j] = x0r + x2r;
-                    a[i][j + 1] = x0i + x2i;
-                    a[i][j2] = x2i - x0i;
-                    a[i][j2 + 1] = x0r - x2r;
-                    x0r = x1r - x3i;
-                    x0i = x1i + x3r;
-                    a[i][j1] = wk1r * (x0r - x0i);
-                    a[i][j1 + 1] = wk1r * (x0r + x0i);
-                    x0r = x3i + x1r;
-                    x0i = x3r - x1i;
-                    a[i][j3] = wk1r * (x0i - x0r);
-                    a[i][j3 + 1] = wk1r * (x0i + x0r);
-                }
-                k1 = 1;
-                ks = -1;
-                for (k = (m << 1); k <= n - m; k += m) {
-                    k1++;
-                    ks = -ks;
-                    wk1r = w[k1 << 1];
-                    wk1i = w[(k1 << 1) + 1];
-                    wk2r = ks * w[k1];
-                    wk2i = w[k1 + ks];
-                    wk3r = wk1r - 2 * wk2i * wk1i;
-                    wk3i = 2 * wk2i * wk1r - wk1i;
-                    for (j = k; j <= l + k - 2; j += 2) {
-                        j1 = j + l;
-                        j2 = j1 + l;
-                        j3 = j2 + l;
-                        x0r = a[i][j] + a[i][j1];
-                        x0i = a[i][j + 1] + a[i][j1 + 1];
-                        x1r = a[i][j] - a[i][j1];
-                        x1i = a[i][j + 1] - a[i][j1 + 1];
-                        x2r = a[i][j2] + a[i][j3];
-                        x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                        x3r = a[i][j2] - a[i][j3];
-                        x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                        a[i][j] = x0r + x2r;
-                        a[i][j + 1] = x0i + x2i;
-                        x0r -= x2r;
-                        x0i -= x2i;
-                        a[i][j2] = wk2r * x0r - wk2i * x0i;
-                        a[i][j2 + 1] = wk2r * x0i + wk2i * x0r;
-                        x0r = x1r - x3i;
-                        x0i = x1i + x3r;
-                        a[i][j1] = wk1r * x0r - wk1i * x0i;
-                        a[i][j1 + 1] = wk1r * x0i + wk1i * x0r;
-                        x0r = x1r + x3i;
-                        x0i = x1i - x3r;
-                        a[i][j3] = wk3r * x0r - wk3i * x0i;
-                        a[i][j3 + 1] = wk3r * x0i + wk3i * x0r;
-                    }
-                }
-            }
-            l = m;
-        }
-        if (l < n) {
-            for (j = 0; j <= l - 2; j += 2) {
-                j1 = j + l;
-                x0r = a[i][j] - a[i][j1];
-                x0i = a[i][j + 1] - a[i][j1 + 1];
-                a[i][j] += a[i][j1];
-                a[i][j + 1] += a[i][j1 + 1];
-                a[i][j1] = x0r;
-                a[i][j1 + 1] = x0i;
-            }
-        }
-    }
-}
-
-
-void cftbrow(int n, int n2, double **a, double *w)
-{
-    int i, j, j1, j2, j3, k, k1, ks, l, m;
-    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    l = 1;
-    while ((l << 1) < n) {
-        m = l << 2;
-        for (j = 0; j <= l - 1; j++) {
-            j1 = j + l;
-            j2 = j1 + l;
-            j3 = j2 + l;
-            for (i = 0; i <= n2 - 2; i += 2) {
-                x0r = a[j][i] + a[j1][i];
-                x0i = a[j][i + 1] + a[j1][i + 1];
-                x1r = a[j][i] - a[j1][i];
-                x1i = a[j][i + 1] - a[j1][i + 1];
-                x2r = a[j2][i] + a[j3][i];
-                x2i = a[j2][i + 1] + a[j3][i + 1];
-                x3r = a[j2][i] - a[j3][i];
-                x3i = a[j2][i + 1] - a[j3][i + 1];
-                a[j][i] = x0r + x2r;
-                a[j][i + 1] = x0i + x2i;
-                a[j2][i] = x0r - x2r;
-                a[j2][i + 1] = x0i - x2i;
-                a[j1][i] = x1r - x3i;
-                a[j1][i + 1] = x1i + x3r;
-                a[j3][i] = x1r + x3i;
-                a[j3][i + 1] = x1i - x3r;
-            }
-        }
-        if (m < n) {
-            wk1r = w[2];
-            for (j = m; j <= l + m - 1; j++) {
-                j1 = j + l;
-                j2 = j1 + l;
-                j3 = j2 + l;
-                for (i = 0; i <= n2 - 2; i += 2) {
-                    x0r = a[j][i] + a[j1][i];
-                    x0i = a[j][i + 1] + a[j1][i + 1];
-                    x1r = a[j][i] - a[j1][i];
-                    x1i = a[j][i + 1] - a[j1][i + 1];
-                    x2r = a[j2][i] + a[j3][i];
-                    x2i = a[j2][i + 1] + a[j3][i + 1];
-                    x3r = a[j2][i] - a[j3][i];
-                    x3i = a[j2][i + 1] - a[j3][i + 1];
-                    a[j][i] = x0r + x2r;
-                    a[j][i + 1] = x0i + x2i;
-                    a[j2][i] = x2i - x0i;
-                    a[j2][i + 1] = x0r - x2r;
-                    x0r = x1r - x3i;
-                    x0i = x1i + x3r;
-                    a[j1][i] = wk1r * (x0r - x0i);
-                    a[j1][i + 1] = wk1r * (x0r + x0i);
-                    x0r = x3i + x1r;
-                    x0i = x3r - x1i;
-                    a[j3][i] = wk1r * (x0i - x0r);
-                    a[j3][i + 1] = wk1r * (x0i + x0r);
-                }
-            }
-            k1 = 1;
-            ks = -1;
-            for (k = (m << 1); k <= n - m; k += m) {
-                k1++;
-                ks = -ks;
-                wk1r = w[k1 << 1];
-                wk1i = w[(k1 << 1) + 1];
-                wk2r = ks * w[k1];
-                wk2i = w[k1 + ks];
-                wk3r = wk1r - 2 * wk2i * wk1i;
-                wk3i = 2 * wk2i * wk1r - wk1i;
-                for (j = k; j <= l + k - 1; j++) {
-                    j1 = j + l;
-                    j2 = j1 + l;
-                    j3 = j2 + l;
-                    for (i = 0; i <= n2 - 2; i += 2) {
-                        x0r = a[j][i] + a[j1][i];
-                        x0i = a[j][i + 1] + a[j1][i + 1];
-                        x1r = a[j][i] - a[j1][i];
-                        x1i = a[j][i + 1] - a[j1][i + 1];
-                        x2r = a[j2][i] + a[j3][i];
-                        x2i = a[j2][i + 1] + a[j3][i + 1];
-                        x3r = a[j2][i] - a[j3][i];
-                        x3i = a[j2][i + 1] - a[j3][i + 1];
-                        a[j][i] = x0r + x2r;
-                        a[j][i + 1] = x0i + x2i;
-                        x0r -= x2r;
-                        x0i -= x2i;
-                        a[j2][i] = wk2r * x0r - wk2i * x0i;
-                        a[j2][i + 1] = wk2r * x0i + wk2i * x0r;
-                        x0r = x1r - x3i;
-                        x0i = x1i + x3r;
-                        a[j1][i] = wk1r * x0r - wk1i * x0i;
-                        a[j1][i + 1] = wk1r * x0i + wk1i * x0r;
-                        x0r = x1r + x3i;
-                        x0i = x1i - x3r;
-                        a[j3][i] = wk3r * x0r - wk3i * x0i;
-                        a[j3][i + 1] = wk3r * x0i + wk3i * x0r;
-                    }
-                }
-            }
-        }
-        l = m;
-    }
-    if (l < n) {
-        for (j = 0; j <= l - 1; j++) {
-            j1 = j + l;
-            for (i = 0; i <= n2 - 2; i += 2) {
-                x0r = a[j][i] - a[j1][i];
-                x0i = a[j][i + 1] - a[j1][i + 1];
-                a[j][i] += a[j1][i];
-                a[j][i + 1] += a[j1][i + 1];
-                a[j1][i] = x0r;
-                a[j1][i + 1] = x0i;
-            }
-        }
-    }
-}
-
-
-void cftfcol(int n1, int n, double **a, double *w)
-{
-    int i, j, j1, j2, j3, k, k1, ks, l, m;
-    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    for (i = 0; i <= n1 - 1; i++) {
-        l = 2;
-        while ((l << 1) < n) {
-            m = l << 2;
-            for (j = 0; j <= l - 2; j += 2) {
-                j1 = j + l;
-                j2 = j1 + l;
-                j3 = j2 + l;
-                x0r = a[i][j] + a[i][j1];
-                x0i = a[i][j + 1] + a[i][j1 + 1];
-                x1r = a[i][j] - a[i][j1];
-                x1i = a[i][j + 1] - a[i][j1 + 1];
-                x2r = a[i][j2] + a[i][j3];
-                x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                x3r = a[i][j2] - a[i][j3];
-                x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                a[i][j] = x0r + x2r;
-                a[i][j + 1] = x0i + x2i;
-                a[i][j2] = x0r - x2r;
-                a[i][j2 + 1] = x0i - x2i;
-                a[i][j1] = x1r + x3i;
-                a[i][j1 + 1] = x1i - x3r;
-                a[i][j3] = x1r - x3i;
-                a[i][j3 + 1] = x1i + x3r;
-            }
-            if (m < n) {
-                wk1r = w[2];
-                for (j = m; j <= l + m - 2; j += 2) {
-                    j1 = j + l;
-                    j2 = j1 + l;
-                    j3 = j2 + l;
-                    x0r = a[i][j] + a[i][j1];
-                    x0i = a[i][j + 1] + a[i][j1 + 1];
-                    x1r = a[i][j] - a[i][j1];
-                    x1i = a[i][j + 1] - a[i][j1 + 1];
-                    x2r = a[i][j2] + a[i][j3];
-                    x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                    x3r = a[i][j2] - a[i][j3];
-                    x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                    a[i][j] = x0r + x2r;
-                    a[i][j + 1] = x0i + x2i;
-                    a[i][j2] = x0i - x2i;
-                    a[i][j2 + 1] = x2r - x0r;
-                    x0r = x1r + x3i;
-                    x0i = x1i - x3r;
-                    a[i][j1] = wk1r * (x0i + x0r);
-                    a[i][j1 + 1] = wk1r * (x0i - x0r);
-                    x0r = x3i - x1r;
-                    x0i = x3r + x1i;
-                    a[i][j3] = wk1r * (x0r + x0i);
-                    a[i][j3 + 1] = wk1r * (x0r - x0i);
-                }
-                k1 = 1;
-                ks = -1;
-                for (k = (m << 1); k <= n - m; k += m) {
-                    k1++;
-                    ks = -ks;
-                    wk1r = w[k1 << 1];
-                    wk1i = w[(k1 << 1) + 1];
-                    wk2r = ks * w[k1];
-                    wk2i = w[k1 + ks];
-                    wk3r = wk1r - 2 * wk2i * wk1i;
-                    wk3i = 2 * wk2i * wk1r - wk1i;
-                    for (j = k; j <= l + k - 2; j += 2) {
-                        j1 = j + l;
-                        j2 = j1 + l;
-                        j3 = j2 + l;
-                        x0r = a[i][j] + a[i][j1];
-                        x0i = a[i][j + 1] + a[i][j1 + 1];
-                        x1r = a[i][j] - a[i][j1];
-                        x1i = a[i][j + 1] - a[i][j1 + 1];
-                        x2r = a[i][j2] + a[i][j3];
-                        x2i = a[i][j2 + 1] + a[i][j3 + 1];
-                        x3r = a[i][j2] - a[i][j3];
-                        x3i = a[i][j2 + 1] - a[i][j3 + 1];
-                        a[i][j] = x0r + x2r;
-                        a[i][j + 1] = x0i + x2i;
-                        x0r -= x2r;
-                        x0i -= x2i;
-                        a[i][j2] = wk2r * x0r + wk2i * x0i;
-                        a[i][j2 + 1] = wk2r * x0i - wk2i * x0r;
-                        x0r = x1r + x3i;
-                        x0i = x1i - x3r;
-                        a[i][j1] = wk1r * x0r + wk1i * x0i;
-                        a[i][j1 + 1] = wk1r * x0i - wk1i * x0r;
-                        x0r = x1r - x3i;
-                        x0i = x1i + x3r;
-                        a[i][j3] = wk3r * x0r + wk3i * x0i;
-                        a[i][j3 + 1] = wk3r * x0i - wk3i * x0r;
-                    }
-                }
-            }
-            l = m;
-        }
-        if (l < n) {
-            for (j = 0; j <= l - 2; j += 2) {
-                j1 = j + l;
-                x0r = a[i][j] - a[i][j1];
-                x0i = a[i][j + 1] - a[i][j1 + 1];
-                a[i][j] += a[i][j1];
-                a[i][j + 1] += a[i][j1 + 1];
-                a[i][j1] = x0r;
-                a[i][j1 + 1] = x0i;
-            }
-        }
-    }
-}
-
-
-void cftfrow(int n, int n2, double **a, double *w)
-{
-    int i, j, j1, j2, j3, k, k1, ks, l, m;
-    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    l = 1;
-    while ((l << 1) < n) {
-        m = l << 2;
-        for (j = 0; j <= l - 1; j++) {
-            j1 = j + l;
-            j2 = j1 + l;
-            j3 = j2 + l;
-            for (i = 0; i <= n2 - 2; i += 2) {
-                x0r = a[j][i] + a[j1][i];
-                x0i = a[j][i + 1] + a[j1][i + 1];
-                x1r = a[j][i] - a[j1][i];
-                x1i = a[j][i + 1] - a[j1][i + 1];
-                x2r = a[j2][i] + a[j3][i];
-                x2i = a[j2][i + 1] + a[j3][i + 1];
-                x3r = a[j2][i] - a[j3][i];
-                x3i = a[j2][i + 1] - a[j3][i + 1];
-                a[j][i] = x0r + x2r;
-                a[j][i + 1] = x0i + x2i;
-                a[j2][i] = x0r - x2r;
-                a[j2][i + 1] = x0i - x2i;
-                a[j1][i] = x1r + x3i;
-                a[j1][i + 1] = x1i - x3r;
-                a[j3][i] = x1r - x3i;
-                a[j3][i + 1] = x1i + x3r;
-            }
-        }
-        if (m < n) {
-            wk1r = w[2];
-            for (j = m; j <= l + m - 1; j++) {
-                j1 = j + l;
-                j2 = j1 + l;
-                j3 = j2 + l;
-                for (i = 0; i <= n2 - 2; i += 2) {
-                    x0r = a[j][i] + a[j1][i];
-                    x0i = a[j][i + 1] + a[j1][i + 1];
-                    x1r = a[j][i] - a[j1][i];
-                    x1i = a[j][i + 1] - a[j1][i + 1];
-                    x2r = a[j2][i] + a[j3][i];
-                    x2i = a[j2][i + 1] + a[j3][i + 1];
-                    x3r = a[j2][i] - a[j3][i];
-                    x3i = a[j2][i + 1] - a[j3][i + 1];
-                    a[j][i] = x0r + x2r;
-                    a[j][i + 1] = x0i + x2i;
-                    a[j2][i] = x0i - x2i;
-                    a[j2][i + 1] = x2r - x0r;
-                    x0r = x1r + x3i;
-                    x0i = x1i - x3r;
-                    a[j1][i] = wk1r * (x0i + x0r);
-                    a[j1][i + 1] = wk1r * (x0i - x0r);
-                    x0r = x3i - x1r;
-                    x0i = x3r + x1i;
-                    a[j3][i] = wk1r * (x0r + x0i);
-                    a[j3][i + 1] = wk1r * (x0r - x0i);
-                }
-            }
-            k1 = 1;
-            ks = -1;
-            for (k = (m << 1); k <= n - m; k += m) {
-                k1++;
-                ks = -ks;
-                wk1r = w[k1 << 1];
-                wk1i = w[(k1 << 1) + 1];
-                wk2r = ks * w[k1];
-                wk2i = w[k1 + ks];
-                wk3r = wk1r - 2 * wk2i * wk1i;
-                wk3i = 2 * wk2i * wk1r - wk1i;
-                for (j = k; j <= l + k - 1; j++) {
-                    j1 = j + l;
-                    j2 = j1 + l;
-                    j3 = j2 + l;
-                    for (i = 0; i <= n2 - 2; i += 2) {
-                        x0r = a[j][i] + a[j1][i];
-                        x0i = a[j][i + 1] + a[j1][i + 1];
-                        x1r = a[j][i] - a[j1][i];
-                        x1i = a[j][i + 1] - a[j1][i + 1];
-                        x2r = a[j2][i] + a[j3][i];
-                        x2i = a[j2][i + 1] + a[j3][i + 1];
-                        x3r = a[j2][i] - a[j3][i];
-                        x3i = a[j2][i + 1] - a[j3][i + 1];
-                        a[j][i] = x0r + x2r;
-                        a[j][i + 1] = x0i + x2i;
-                        x0r -= x2r;
-                        x0i -= x2i;
-                        a[j2][i] = wk2r * x0r + wk2i * x0i;
-                        a[j2][i + 1] = wk2r * x0i - wk2i * x0r;
-                        x0r = x1r + x3i;
-                        x0i = x1i - x3r;
-                        a[j1][i] = wk1r * x0r + wk1i * x0i;
-                        a[j1][i + 1] = wk1r * x0i - wk1i * x0r;
-                        x0r = x1r - x3i;
-                        x0i = x1i + x3r;
-                        a[j3][i] = wk3r * x0r + wk3i * x0i;
-                        a[j3][i + 1] = wk3r * x0i - wk3i * x0r;
-                    }
-                }
-            }
-        }
-        l = m;
-    }
-    if (l < n) {
-        for (j = 0; j <= l - 1; j++) {
-            j1 = j + l;
-            for (i = 0; i <= n2 - 2; i += 2) {
-                x0r = a[j][i] - a[j1][i];
-                x0i = a[j][i + 1] - a[j1][i + 1];
-                a[j][i] += a[j1][i];
-                a[j][i + 1] += a[j1][i + 1];
-                a[j1][i] = x0r;
-                a[j1][i + 1] = x0i;
-            }
-        }
-    }
-}
-
-
-void rftbcol(int n1, int n, double **a, int nc, double *c)
-{
-    int i, j, k, kk, ks;
-    double wkr, wki, xr, xi, yr, yi;
-    
-    ks = (nc << 2) / n;
-    for (i = 0; i <= n1 - 1; i++) {
-        kk = 0;
-        for (k = (n >> 1) - 2; k >= 2; k -= 2) {
-            j = n - k;
-            kk += ks;
-            wkr = 0.5 - c[kk];
-            wki = c[nc - kk];
-            xr = a[i][k] - a[i][j];
-            xi = a[i][k + 1] + a[i][j + 1];
-            yr = wkr * xr - wki * xi;
-            yi = wkr * xi + wki * xr;
-            a[i][k] -= yr;
-            a[i][k + 1] -= yi;
-            a[i][j] += yr;
-            a[i][j + 1] -= yi;
-        }
-    }
-}
-
-
-void rftfcol(int n1, int n, double **a, int nc, double *c)
-{
-    int i, j, k, kk, ks;
-    double wkr, wki, xr, xi, yr, yi;
-    
-    ks = (nc << 2) / n;
-    for (i = 0; i <= n1 - 1; i++) {
-        kk = 0;
-        for (k = (n >> 1) - 2; k >= 2; k -= 2) {
-            j = n - k;
-            kk += ks;
-            wkr = 0.5 - c[kk];
-            wki = c[nc - kk];
-            xr = a[i][k] - a[i][j];
-            xi = a[i][k + 1] + a[i][j + 1];
-            yr = wkr * xr + wki * xi;
-            yi = wkr * xi - wki * xr;
-            a[i][k] -= yr;
-            a[i][k + 1] -= yi;
-            a[i][j] += yr;
-            a[i][j + 1] -= yi;
-        }
-    }
-}
-
-
-void dctbsub(int n1, int n2, double **a, int nc, double *c)
-{
-    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
-    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
-    
-    ks1 = nc / n1;
-    ks2 = nc / n2;
-    n1h = n1 >> 1;
-    kk1 = ks1;
-    for (k1 = 1; k1 <= n1h - 1; k1++) {
-        j1 = n1 - k1;
-        w1r = 2 * c[kk1];
-        w1i = 2 * c[nc - kk1];
-        kk1 += ks1;
-        kk2 = ks2;
-        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-            x0r = w1r * c[kk2];
-            x0i = w1i * c[kk2];
-            x1r = w1r * c[nc - kk2];
-            x1i = w1i * c[nc - kk2];
-            wkr = x0r - x1i;
-            wki = x0i + x1r;
-            wji = x0r + x1i;
-            wjr = x0i - x1r;
-            kk2 += ks2;
-            x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
-            x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
-            x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
-            x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
-            a[k1][k2] = x0r + x1i;
-            a[k1][k2 + 1] = x0i - x1r;
-            a[j1][k2] = x1r + x0i;
-            a[j1][k2 + 1] = x1i - x0r;
-        }
-        wkr = w1r * 0.5;
-        wki = w1i * 0.5;
-        wjr = w1r * c[kk2];
-        wji = w1i * c[kk2];
-        x0r = a[k1][0] + a[j1][0];
-        x0i = a[k1][1] - a[j1][1];
-        x1r = a[k1][0] - a[j1][0];
-        x1i = a[k1][1] + a[j1][1];
-        a[k1][0] = wkr * x0r - wki * x0i;
-        a[k1][1] = wkr * x0i + wki * x0r;
-        a[j1][0] = -wjr * x1r + wji * x1i;
-        a[j1][1] = wjr * x1i + wji * x1r;
-    }
-    w1r = 2 * c[kk1];
-    kk2 = ks2;
-    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-        wkr = 2 * c[kk2];
-        wki = 2 * c[nc - kk2];
-        wjr = w1r * wkr;
-        wji = w1r * wki;
-        kk2 += ks2;
-        x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
-        a[0][k2] = wkr * a[0][k2] - wki * a[0][k2 + 1];
-        a[0][k2 + 1] = x0i;
-        x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
-        a[n1h][k2] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
-        a[n1h][k2 + 1] = x0i;
-    }
-    a[0][1] *= w1r;
-    a[n1h][0] *= w1r;
-    a[n1h][1] *= 0.5;
-}
-
-
-void dctfsub(int n1, int n2, double **a, int nc, double *c)
-{
-    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
-    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
-    
-    ks1 = nc / n1;
-    ks2 = nc / n2;
-    n1h = n1 >> 1;
-    kk1 = ks1;
-    for (k1 = 1; k1 <= n1h - 1; k1++) {
-        j1 = n1 - k1;
-        w1r = 2 * c[kk1];
-        w1i = 2 * c[nc - kk1];
-        kk1 += ks1;
-        kk2 = ks2;
-        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-            x0r = w1r * c[kk2];
-            x0i = w1i * c[kk2];
-            x1r = w1r * c[nc - kk2];
-            x1i = w1i * c[nc - kk2];
-            wkr = x0r - x1i;
-            wki = x0i + x1r;
-            wji = x0r + x1i;
-            wjr = x0i - x1r;
-            kk2 += ks2;
-            x0r = a[k1][k2] - a[j1][k2 + 1];
-            x0i = a[j1][k2] + a[k1][k2 + 1];
-            x1r = a[j1][k2] - a[k1][k2 + 1];
-            x1i = a[k1][k2] + a[j1][k2 + 1];
-            a[k1][k2] = wkr * x0r + wki * x0i;
-            a[k1][k2 + 1] = wkr * x0i - wki * x0r;
-            a[j1][k2] = wjr * x1r + wji * x1i;
-            a[j1][k2 + 1] = wjr * x1i - wji * x1r;
-        }
-        x0r = 2 * c[kk2];
-        wjr = x0r * w1r;
-        wji = x0r * w1i;
-        x0r = w1r * a[k1][0] + w1i * a[k1][1];
-        x0i = w1r * a[k1][1] - w1i * a[k1][0];
-        x1r = -wjr * a[j1][0] + wji * a[j1][1];
-        x1i = wjr * a[j1][1] + wji * a[j1][0];
-        a[k1][0] = x0r + x1r;
-        a[k1][1] = x1i + x0i;
-        a[j1][0] = x0r - x1r;
-        a[j1][1] = x1i - x0i;
-    }
-    w1r = 2 * c[kk1];
-    kk2 = ks2;
-    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-        wkr = 2 * c[kk2];
-        wki = 2 * c[nc - kk2];
-        wjr = w1r * wkr;
-        wji = w1r * wki;
-        kk2 += ks2;
-        x0i = wkr * a[0][k2 + 1] - wki * a[0][k2];
-        a[0][k2] = wkr * a[0][k2] + wki * a[0][k2 + 1];
-        a[0][k2 + 1] = x0i;
-        x0i = wjr * a[n1h][k2 + 1] - wji * a[n1h][k2];
-        a[n1h][k2] = wjr * a[n1h][k2] + wji * a[n1h][k2 + 1];
-        a[n1h][k2 + 1] = x0i;
-    }
-    w1r *= 2;
-    a[0][0] *= 2;
-    a[0][1] *= w1r;
-    a[n1h][0] *= w1r;
-}
-
-
-void dstbsub(int n1, int n2, double **a, int nc, double *c)
-{
-    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
-    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
-    
-    ks1 = nc / n1;
-    ks2 = nc / n2;
-    n1h = n1 >> 1;
-    kk1 = ks1;
-    for (k1 = 1; k1 <= n1h - 1; k1++) {
-        j1 = n1 - k1;
-        w1r = 2 * c[kk1];
-        w1i = 2 * c[nc - kk1];
-        kk1 += ks1;
-        kk2 = ks2;
-        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-            x0r = w1r * c[kk2];
-            x0i = w1i * c[kk2];
-            x1r = w1r * c[nc - kk2];
-            x1i = w1i * c[nc - kk2];
-            wkr = x0r - x1i;
-            wki = x0i + x1r;
-            wji = x0r + x1i;
-            wjr = x0i - x1r;
-            kk2 += ks2;
-            x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
-            x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
-            x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
-            x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
-            a[k1][k2] = x1i - x0r;
-            a[k1][k2 + 1] = x1r + x0i;
-            a[j1][k2] = x0i - x1r;
-            a[j1][k2 + 1] = x0r + x1i;
-        }
-        wkr = w1r * 0.5;
-        wki = w1i * 0.5;
-        wjr = w1r * c[kk2];
-        wji = w1i * c[kk2];
-        x0r = a[k1][0] + a[j1][0];
-        x0i = a[k1][1] - a[j1][1];
-        x1r = a[k1][0] - a[j1][0];
-        x1i = a[k1][1] + a[j1][1];
-        a[k1][1] = wkr * x0r - wki * x0i;
-        a[k1][0] = wkr * x0i + wki * x0r;
-        a[j1][1] = -wjr * x1r + wji * x1i;
-        a[j1][0] = wjr * x1i + wji * x1r;
-    }
-    w1r = 2 * c[kk1];
-    kk2 = ks2;
-    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-        wkr = 2 * c[kk2];
-        wki = 2 * c[nc - kk2];
-        wjr = w1r * wkr;
-        wji = w1r * wki;
-        kk2 += ks2;
-        x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
-        a[0][k2 + 1] = wkr * a[0][k2] - wki * a[0][k2 + 1];
-        a[0][k2] = x0i;
-        x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
-        a[n1h][k2 + 1] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
-        a[n1h][k2] = x0i;
-    }
-    a[0][1] *= w1r;
-    a[n1h][0] *= w1r;
-    a[n1h][1] *= 0.5;
-}
-
-
-void dstfsub(int n1, int n2, double **a, int nc, double *c)
-{
-    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
-    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
-    
-    ks1 = nc / n1;
-    ks2 = nc / n2;
-    n1h = n1 >> 1;
-    kk1 = ks1;
-    for (k1 = 1; k1 <= n1h - 1; k1++) {
-        j1 = n1 - k1;
-        w1r = 2 * c[kk1];
-        w1i = 2 * c[nc - kk1];
-        kk1 += ks1;
-        kk2 = ks2;
-        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-            x0r = w1r * c[kk2];
-            x0i = w1i * c[kk2];
-            x1r = w1r * c[nc - kk2];
-            x1i = w1i * c[nc - kk2];
-            wkr = x0r - x1i;
-            wki = x0i + x1r;
-            wji = x0r + x1i;
-            wjr = x0i - x1r;
-            kk2 += ks2;
-            x0r = a[j1][k2 + 1] - a[k1][k2];
-            x0i = a[k1][k2 + 1] + a[j1][k2];
-            x1r = a[k1][k2 + 1] - a[j1][k2];
-            x1i = a[j1][k2 + 1] + a[k1][k2];
-            a[k1][k2] = wkr * x0r + wki * x0i;
-            a[k1][k2 + 1] = wkr * x0i - wki * x0r;
-            a[j1][k2] = wjr * x1r + wji * x1i;
-            a[j1][k2 + 1] = wjr * x1i - wji * x1r;
-        }
-        x0r = 2 * c[kk2];
-        wjr = x0r * w1r;
-        wji = x0r * w1i;
-        x0r = w1r * a[k1][1] + w1i * a[k1][0];
-        x0i = w1r * a[k1][0] - w1i * a[k1][1];
-        x1r = -wjr * a[j1][1] + wji * a[j1][0];
-        x1i = wjr * a[j1][0] + wji * a[j1][1];
-        a[k1][0] = x0r + x1r;
-        a[k1][1] = x1i + x0i;
-        a[j1][0] = x0r - x1r;
-        a[j1][1] = x1i - x0i;
-    }
-    w1r = 2 * c[kk1];
-    kk2 = ks2;
-    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
-        wkr = 2 * c[kk2];
-        wki = 2 * c[nc - kk2];
-        wjr = w1r * wkr;
-        wji = w1r * wki;
-        kk2 += ks2;
-        x0i = wkr * a[0][k2] - wki * a[0][k2 + 1];
-        a[0][k2] = wkr * a[0][k2 + 1] + wki * a[0][k2];
-        a[0][k2 + 1] = x0i;
-        x0i = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
-        a[n1h][k2] = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
-        a[n1h][k2 + 1] = x0i;
-    }
-    w1r *= 2;
-    a[0][0] *= 2;
-    a[0][1] *= w1r;
-    a[n1h][0] *= w1r;
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/fft4f2d.f b/third_party/tensorflow_dependencies/fft2d/fft4f2d.f
deleted file mode 100644
index af529e3..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fft4f2d.f
+++ /dev/null
@@ -1,1591 +0,0 @@
-! Fast Fourier/Cosine/Sine Transform
-!     dimension   :two
-!     data length :power of 2
-!     decimation  :frequency
-!     radix       :4, 2, row-column
-!     data        :inplace
-!     table       :use
-! subroutines
-!     cdft2d: Complex Discrete Fourier Transform
-!     rdft2d: Real Discrete Fourier Transform
-!     ddct2d: Discrete Cosine Transform
-!     ddst2d: Discrete Sine Transform
-!
-!
-! -------- Complex DFT (Discrete Fourier Transform) --------
-!     [definition]
-!         <case1>
-!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
-!                            exp(2*pi*i*j1*k1/n1) * 
-!                            exp(2*pi*i*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2>
-!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
-!                            exp(-2*pi*i*j1*k1/n1) * 
-!                            exp(-2*pi*i*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         2*n1   :data length (integer)
-!                 n1 >= 1, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 1, n2 = power of 2
-!         a(0:2*n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 input data
-!                     a(2*j1,j2) = Re(x(j1,j2)), 
-!                     a(2*j1+1,j2) = Im(x(j1,j2)), 
-!                     0<=j1<n1, 0<=j2<n2
-!                 output data
-!                     a(2*k1,k2) = Re(X(k1,k2)), 
-!                     a(2*k1+1,k2) = Im(X(k1,k2)), 
-!                     0<=k1<n1, 0<=k2<n2
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/2, n2/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
-!         is 
-!             call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, 2 * n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (1.0d0 / (n1 * n2))
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- Real DFT / Inverse of Real DFT --------
-!     [definition]
-!         <case1> RDFT
-!             R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!             I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> IRDFT (excluding scale)
-!             a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
-!                            (R(j1,j2) * 
-!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
-!                            I(j1,j2) * 
-!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
-!                            0<=k1<n1, 0<=k2<n2
-!         (notes: R(n1-k1,n2-k2) = R(k1,k2), 
-!                 I(n1-k1,n2-k2) = -I(k1,k2), 
-!                 R(n1-k1,0) = R(k1,0), 
-!                 I(n1-k1,0) = -I(k1,0), 
-!                 R(0,n2-k2) = R(0,k2), 
-!                 I(0,n2-k2) = -I(0,k2), 
-!                 0<k1<n1, 0<k2<n2)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call rdft2d(n1max, n1, n2, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call rdft2d(n1max, n1, n2, -1, a, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     output data
-!                         a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2), 
-!                         a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2), 
-!                             0<k1<n1/2, 0<k2<n2, 
-!                         a(2*k1,0) = R(k1,0) = R(n1-k1,0), 
-!                         a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0), 
-!                             0<k1<n1/2, 
-!                         a(0,k2) = R(0,k2) = R(0,n2-k2), 
-!                         a(1,k2) = I(0,k2) = -I(0,n2-k2), 
-!                         a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2), 
-!                         a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2), 
-!                             0<k2<n2/2, 
-!                         a(0,0) = R(0,0), 
-!                         a(1,0) = R(n1/2,0), 
-!                         a(0,n2/2) = R(0,n2/2), 
-!                         a(1,n2/2) = R(n1/2,n2/2)
-!                 <case2>
-!                     input data
-!                         a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2), 
-!                         a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2), 
-!                             0<j1<n1/2, 0<j2<n2, 
-!                         a(2*j1,0) = R(j1,0) = R(n1-j1,0), 
-!                         a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0), 
-!                             0<j1<n1/2, 
-!                         a(0,j2) = R(0,j2) = R(0,n2-j2), 
-!                         a(1,j2) = I(0,j2) = -I(0,n2-j2), 
-!                         a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2), 
-!                         a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2), 
-!                             0<j2<n2/2, 
-!                         a(0,0) = R(0,0), 
-!                         a(1,0) = R(n1/2,0), 
-!                         a(0,n2/2) = R(0,n2/2), 
-!                         a(1,n2/2) = R(n1/2,n2/2)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/4, n2/2) + n1/4
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call rdft2d(n1max, n1, n2, 1, a, ip, w)
-!         is 
-!             call rdft2d(n1max, n1, n2, -1, a, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (2.0d0 / (n1 * n2))
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-!     [definition]
-!         <case1> IDCT (excluding scale)
-!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(pi*j1*(k1+1/2)/n1) * 
-!                            cos(pi*j2*(k2+1/2)/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> DCT
-!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(pi*(j1+1/2)*k1/n1) * 
-!                            cos(pi*(j2+1/2)*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 output data
-!                     a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
-!         t(0:n1-1,0:n2-1)
-!                :work area (real*8)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/4, n2/2) + max(n1, n2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
-!         is 
-!             do j1 = 0, n1 - 1
-!                 a(j1, 0) = a(j1, 0) * 0.5d0
-!             end do
-!             do j2 = 0, n2 - 1
-!                 a(0, j2) = a(0, j2) * 0.5d0
-!             end do
-!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DST (Discrete Sine Transform) / Inverse of DST --------
-!     [definition]
-!         <case1> IDST (excluding scale)
-!             S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) * 
-!                            sin(pi*j1*(k1+1/2)/n1) * 
-!                            sin(pi*j2*(k2+1/2)/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> DST
-!             S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            sin(pi*(j1+1/2)*k1/n1) * 
-!                            sin(pi*(j2+1/2)*k2/n2), 
-!                            0<k1<=n1, 0<k2<=n2
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     input data
-!                         a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2, 
-!                         a(j1,0) = A(j1,n2), 0<j1<n1, 
-!                         a(0,j2) = A(n1,j2), 0<j2<n2, 
-!                         a(0,0) = A(n1,n2)
-!                         (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
-!                     output data
-!                         a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
-!                 <case2>
-!                     output data
-!                         a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2, 
-!                         a(k1,0) = S(k1,n2), 0<k1<n1, 
-!                         a(0,k2) = S(n1,k2), 0<k2<n2, 
-!                         a(0,0) = S(n1,n2)
-!                         (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
-!         t(0:n1-1,0:n2-1)
-!                :work area (real*8)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/4, n2/2) + max(n1, n2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
-!         is 
-!             do j1 = 0, n1 - 1
-!                 a(j1, 0) = a(j1, 0) * 0.5d0
-!             end do
-!             do j2 = 0, n2 - 1
-!                 a(0, j2) = a(0, j2) * 0.5d0
-!             end do
-!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
-!                 end do
-!             end do
-!         .
-!
-!
-      subroutine cdft2d(n1max, n1, n2, isgn, a, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
-      n = max(n1, 2 * n2)
-      if (n .gt. 4 * ip(0)) then
-          call makewt(n / 4, ip, w)
-      end if
-      if (n1 .gt. 4) then
-          call bitrv2row(n1max, n1, n2, ip(2), a)
-      end if
-      if (n2 .gt. 2) then
-          call bitrv2col(n1max, n1, n2, ip(2), a)
-      end if
-      if (isgn .lt. 0) then
-          call cftfrow(n1max, n1, n2, a, w)
-          call cftfcol(n1max, n1, n2, a, w)
-      else
-          call cftbrow(n1max, n1, n2, a, w)
-          call cftbcol(n1max, n1, n2, a, w)
-      end if
-      end
-!
-      subroutine rdft2d(n1max, n1, n2, isgn, a, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n2h, i, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
-      n = max(n1, 2 * n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n1 .gt. 4 * nc) then
-          nc = n1 / 4
-          call makect(nc, ip, w(nw))
-      end if
-      n2h = n2 / 2
-      if (isgn .lt. 0) then
-          do i = 1, n2h - 1
-              j = n2 - i
-              xi = a(0, i) - a(0, j)
-              a(0, i) = a(0, i) + a(0, j)
-              a(0, j) = xi
-              xi = a(1, j) - a(1, i)
-              a(1, i) = a(1, i) + a(1, j)
-              a(1, j) = xi
-          end do
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), a)
-          end if
-          call cftfcol(n1max, n1, n2, a, w)
-          do i = 0, n2 - 1
-              a(1, i) = 0.5d0 * (a(0, i) - a(1, i))
-              a(0, i) = a(0, i) - a(1, i)
-          end do
-          if (n1 .gt. 4) then
-              call rftfrow(n1max, n1, n2, a, nc, w(nw))
-              call bitrv2row(n1max, n1, n2, ip(2), a)
-          end if
-          call cftfrow(n1max, n1, n2, a, w)
-      else
-          if (n1 .gt. 4) then
-              call bitrv2row(n1max, n1, n2, ip(2), a)
-          end if
-          call cftbrow(n1max, n1, n2, a, w)
-          if (n1 .gt. 4) then
-              call rftbrow(n1max, n1, n2, a, nc, w(nw))
-          end if
-          do i = 0, n2 - 1
-              xi = a(0, i) - a(1, i)
-              a(0, i) = a(0, i) + a(1, i)
-              a(1, i) = xi
-          end do
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), a)
-          end if
-          call cftbcol(n1max, n1, n2, a, w)
-          do i = 1, n2h - 1
-              j = n2 - i
-              a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
-              a(0, i) = a(0, i) - a(0, j)
-              a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
-              a(1, i) = a(1, i) - a(1, j)
-          end do
-      end if
-      end
-!
-      subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h, 
-     &    i, ix, ic, j, jx, jc
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), 
-     &    t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
-      n = max(n1, 2 * n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n1 .gt. nc .or. n2 .gt. nc) then
-          nc = max(n1, n2)
-          call makect(nc, ip, w(nw))
-      end if
-      n1h = n1 / 2
-      n2h = n2 / 2
-      if (isgn .ge. 0) then
-          do i = 0, n2 - 1
-              do j = 1, n1h - 1
-                  jx = 2 * j
-                  t(jx, i) = a(j, i)
-                  t(jx + 1, i) = a(n1 - j, i)
-              end do
-          end do
-          t(0, 0) = a(0, 0)
-          t(1, 0) = a(n1h, 0)
-          t(0, n2h) = a(0, n2h)
-          t(1, n2h) = a(n1h, n2h)
-          do i = 1, n2h - 1
-              ic = n2 - i
-              t(0, i) = a(0, i)
-              t(1, ic) = a(n1h, i)
-              t(1, i) = a(0, ic)
-              t(0, ic) = a(n1h, ic)
-          end do
-          call dctfsub(n1max, n1, n2, t, nc, w(nw))
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), t)
-          end if
-          call cftfcol(n1max, n1, n2, t, w)
-          do i = 0, n2 - 1
-              t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
-              t(0, i) = t(0, i) - t(1, i)
-          end do
-          if (n1 .gt. 4) then
-              call rftfrow(n1max, n1, n2, t, nc, w(nw))
-              call bitrv2row(n1max, n1, n2, ip(2), t)
-          end if
-          call cftfrow(n1max, n1, n2, t, w)
-          do i = 0, n2h - 1
-              ix = 2 * i
-              ic = n2 - 1 - i
-              do j = 0, n1h - 1
-                  jx = 2 * j
-                  jc = n1 - 1 - j
-                  a(jx, ix) = t(j, i)
-                  a(jx + 1, ix) = t(jc, i)
-                  a(jx, ix + 1) = t(j, ic)
-                  a(jx + 1, ix + 1) = t(jc, ic)
-              end do
-          end do
-      else
-          do i = 0, n2h - 1
-              ix = 2 * i
-              ic = n2 - 1 - i
-              do j = 0, n1h - 1
-                  jx = 2 * j
-                  jc = n1 - 1 - j
-                  t(j, i) = a(jx, ix)
-                  t(jc, i) = a(jx + 1, ix)
-                  t(j, ic) = a(jx, ix + 1)
-                  t(jc, ic) = a(jx + 1, ix + 1)
-              end do
-          end do
-          if (n1 .gt. 4) then
-              call bitrv2row(n1max, n1, n2, ip(2), t)
-          end if
-          call cftbrow(n1max, n1, n2, t, w)
-          if (n1 .gt. 4) then
-              call rftbrow(n1max, n1, n2, t, nc, w(nw))
-          end if
-          do i = 0, n2 - 1
-              xi = t(0, i) - t(1, i)
-              t(0, i) = t(0, i) + t(1, i)
-              t(1, i) = xi
-          end do
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), t)
-          end if
-          call cftbcol(n1max, n1, n2, t, w)
-          call dctbsub(n1max, n1, n2, t, nc, w(nw))
-          do i = 0, n2 - 1
-              do j = 1, n1h - 1
-                  jx = 2 * j
-                  a(j, i) = t(jx, i)
-                  a(n1 - j, i) = t(jx + 1, i)
-              end do
-          end do
-          a(0, 0) = t(0, 0)
-          a(n1h, 0) = t(1, 0)
-          a(0, n2h) = t(0, n2h)
-          a(n1h, n2h) = t(1, n2h)
-          do i = 1, n2h - 1
-              ic = n2 - i
-              a(0, i) = t(0, i)
-              a(n1h, i) = t(1, ic)
-              a(0, ic) = t(1, i)
-              a(n1h, ic) = t(0, ic)
-          end do
-      end if
-      end
-!
-      subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h, 
-     &    i, ix, ic, j, jx, jc
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), 
-     &    t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
-      n = max(n1, 2 * n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n1 .gt. nc .or. n2 .gt. nc) then
-          nc = max(n1, n2)
-          call makect(nc, ip, w(nw))
-      end if
-      n1h = n1 / 2
-      n2h = n2 / 2
-      if (isgn .ge. 0) then
-          do i = 0, n2 - 1
-              do j = 1, n1h - 1
-                  jx = 2 * j
-                  t(jx, i) = a(j, i)
-                  t(jx + 1, i) = a(n1 - j, i)
-              end do
-          end do
-          t(0, 0) = a(0, 0)
-          t(1, 0) = a(n1h, 0)
-          t(0, n2h) = a(0, n2h)
-          t(1, n2h) = a(n1h, n2h)
-          do i = 1, n2h - 1
-              ic = n2 - i
-              t(0, i) = a(0, i)
-              t(1, ic) = a(n1h, i)
-              t(1, i) = a(0, ic)
-              t(0, ic) = a(n1h, ic)
-          end do
-          call dstfsub(n1max, n1, n2, t, nc, w(nw))
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), t)
-          end if
-          call cftfcol(n1max, n1, n2, t, w)
-          do i = 0, n2 - 1
-              t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
-              t(0, i) = t(0, i) - t(1, i)
-          end do
-          if (n1 .gt. 4) then
-              call rftfrow(n1max, n1, n2, t, nc, w(nw))
-              call bitrv2row(n1max, n1, n2, ip(2), t)
-          end if
-          call cftfrow(n1max, n1, n2, t, w)
-          do i = 0, n2h - 1
-              ix = 2 * i
-              ic = n2 - 1 - i
-              do j = 0, n1h - 1
-                  jx = 2 * j
-                  jc = n1 - 1 - j
-                  a(jx, ix) = t(j, i)
-                  a(jx + 1, ix) = -t(jc, i)
-                  a(jx, ix + 1) = -t(j, ic)
-                  a(jx + 1, ix + 1) = t(jc, ic)
-              end do
-          end do
-      else
-          do i = 0, n2h - 1
-              ix = 2 * i
-              ic = n2 - 1 - i
-              do j = 0, n1h - 1
-                  jx = 2 * j
-                  jc = n1 - 1 - j
-                  t(j, i) = a(jx, ix)
-                  t(jc, i) = -a(jx + 1, ix)
-                  t(j, ic) = -a(jx, ix + 1)
-                  t(jc, ic) = a(jx + 1, ix + 1)
-              end do
-          end do
-          if (n1 .gt. 4) then
-              call bitrv2row(n1max, n1, n2, ip(2), t)
-          end if
-          call cftbrow(n1max, n1, n2, t, w)
-          if (n1 .gt. 4) then
-              call rftbrow(n1max, n1, n2, t, nc, w(nw))
-          end if
-          do i = 0, n2 - 1
-              xi = t(0, i) - t(1, i)
-              t(0, i) = t(0, i) + t(1, i)
-              t(1, i) = xi
-          end do
-          if (n2 .gt. 2) then
-              call bitrv2col(n1max, n1, n2, ip(2), t)
-          end if
-          call cftbcol(n1max, n1, n2, t, w)
-          call dstbsub(n1max, n1, n2, t, nc, w(nw))
-          do i = 0, n2 - 1
-              do j = 1, n1h - 1
-                  jx = 2 * j
-                  a(j, i) = t(jx, i)
-                  a(n1 - j, i) = t(jx + 1, i)
-              end do
-          end do
-          a(0, 0) = t(0, 0)
-          a(n1h, 0) = t(1, 0)
-          a(0, n2h) = t(0, n2h)
-          a(n1h, n2h) = t(1, n2h)
-          do i = 1, n2h - 1
-              ic = n2 - i
-              a(0, i) = t(0, i)
-              a(n1h, i) = t(1, ic)
-              a(0, ic) = t(1, i)
-              a(n1h, ic) = t(0, ic)
-          end do
-      end if
-      end
-!
-! -------- initializing routines --------
-!
-      subroutine makewt(nw, ip, w)
-      integer nw, ip(0 : *), nwh, j
-      real*8 w(0 : nw - 1), delta, x, y
-      ip(0) = nw
-      ip(1) = 1
-      if (nw .gt. 2) then
-          nwh = nw / 2
-          delta = atan(1.0d0) / nwh
-          w(0) = 1
-          w(1) = 0
-          w(nwh) = cos(delta * nwh)
-          w(nwh + 1) = w(nwh)
-          do j = 2, nwh - 2, 2
-              x = cos(delta * j)
-              y = sin(delta * j)
-              w(j) = x
-              w(j + 1) = y
-              w(nw - j) = y
-              w(nw - j + 1) = x
-          end do
-          call bitrv2(nw, ip(2), w)
-      end if
-      end
-!
-      subroutine makect(nc, ip, c)
-      integer nc, ip(0 : *), nch, j
-      real*8 c(0 : nc - 1), delta
-      ip(1) = nc
-      if (nc .gt. 1) then
-          nch = nc / 2
-          delta = atan(1.0d0) / nch
-          c(0) = 0.5d0
-          c(nch) = 0.5d0 * cos(delta * nch)
-          do j = 1, nch - 1
-              c(j) = 0.5d0 * cos(delta * j)
-              c(nc - j) = 0.5d0 * sin(delta * j)
-          end do
-      end if
-      end
-!
-! -------- child routines --------
-!
-      subroutine bitrv2(n, ip, a)
-      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
-      real*8 a(0 : n - 1), xr, xi
-      ip(0) = 0
-      l = n
-      m = 1
-      do while (4 * m .lt. l)
-          l = l / 2
-          do j = 0, m - 1
-              ip(m + j) = ip(j) + l
-          end do
-          m = m * 2
-      end do
-      if (4 * m .gt. l) then
-          do k = 1, m - 1
-              do j = 0, k - 1
-                  j1 = 2 * j + ip(k)
-                  k1 = 2 * k + ip(j)
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  a(j1) = a(k1)
-                  a(j1 + 1) = a(k1 + 1)
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-          end do
-      else
-          m2 = 2 * m
-          do k = 1, m - 1
-              do j = 0, k - 1
-                  j1 = 2 * j + ip(k)
-                  k1 = 2 * k + ip(j)
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  a(j1) = a(k1)
-                  a(j1 + 1) = a(k1 + 1)
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + m2
-                  k1 = k1 + m2
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  a(j1) = a(k1)
-                  a(j1 + 1) = a(k1 + 1)
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-          end do
-      end if
-      end
-!
-      subroutine bitrv2row(n1max, n, n2, ip, a)
-      integer n1max, n, n2, ip(0 : *), i, j, j1, k, k1, l, m, m2
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), xr, xi
-      ip(0) = 0
-      l = n
-      m = 1
-      do while (4 * m .lt. l)
-          l = l / 2
-          do j = 0, m - 1
-              ip(m + j) = ip(j) + l
-          end do
-          m = m * 2
-      end do
-      if (4 * m .gt. l) then
-          do i = 0, n2 - 1
-              do k = 1, m - 1
-                  do j = 0, k - 1
-                      j1 = 2 * j + ip(k)
-                      k1 = 2 * k + ip(j)
-                      xr = a(j1, i)
-                      xi = a(j1 + 1, i)
-                      a(j1, i) = a(k1, i)
-                      a(j1 + 1, i) = a(k1 + 1, i)
-                      a(k1, i) = xr
-                      a(k1 + 1, i) = xi
-                  end do
-              end do
-          end do
-      else
-          m2 = 2 * m
-          do i = 0, n2 - 1
-              do k = 1, m - 1
-                  do j = 0, k - 1
-                      j1 = 2 * j + ip(k)
-                      k1 = 2 * k + ip(j)
-                      xr = a(j1, i)
-                      xi = a(j1 + 1, i)
-                      a(j1, i) = a(k1, i)
-                      a(j1 + 1, i) = a(k1 + 1, i)
-                      a(k1, i) = xr
-                      a(k1 + 1, i) = xi
-                      j1 = j1 + m2
-                      k1 = k1 + m2
-                      xr = a(j1, i)
-                      xi = a(j1 + 1, i)
-                      a(j1, i) = a(k1, i)
-                      a(j1 + 1, i) = a(k1 + 1, i)
-                      a(k1, i) = xr
-                      a(k1 + 1, i) = xi
-                  end do
-              end do
-          end do
-      end if
-      end
-!
-      subroutine bitrv2col(n1max, n1, n, ip, a)
-      integer n1max, n1, n, ip(0 : *), i, j, j1, k, k1, l, m
-      real*8 a(0 : n1max - 1, 0 : n - 1), xr, xi
-      ip(0) = 0
-      l = n
-      m = 1
-      do while (2 * m .lt. l)
-          l = l / 2
-          do j = 0, m - 1
-              ip(m + j) = ip(j) + l
-          end do
-          m = m * 2
-      end do
-      if (2 * m .gt. l) then
-          do k = 1, m - 1
-              do j = 0, k - 1
-                  j1 = j + ip(k)
-                  k1 = k + ip(j)
-                  do i = 0, n1 - 2, 2
-                      xr = a(i, j1)
-                      xi = a(i + 1, j1)
-                      a(i, j1) = a(i, k1)
-                      a(i + 1, j1) = a(i + 1, k1)
-                      a(i, k1) = xr
-                      a(i + 1, k1) = xi
-                  end do
-              end do
-          end do
-      else
-          do k = 1, m - 1
-              do j = 0, k - 1
-                  j1 = j + ip(k)
-                  k1 = k + ip(j)
-                  do i = 0, n1 - 2, 2
-                      xr = a(i, j1)
-                      xi = a(i + 1, j1)
-                      a(i, j1) = a(i, k1)
-                      a(i + 1, j1) = a(i + 1, k1)
-                      a(i, k1) = xr
-                      a(i + 1, k1) = xi
-                  end do
-                  j1 = j1 + m
-                  k1 = k1 + m
-                  do i = 0, n1 - 2, 2
-                      xr = a(i, j1)
-                      xi = a(i + 1, j1)
-                      a(i, j1) = a(i, k1)
-                      a(i + 1, j1) = a(i + 1, k1)
-                      a(i, k1) = xr
-                      a(i + 1, k1) = xi
-                  end do
-              end do
-          end do
-      end if
-      end
-!
-      subroutine cftbrow(n1max, n, n2, a, w)
-      integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
-      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      do i = 0, n2 - 1
-          l = 2
-          do while (2 * l .lt. n)
-              m = 4 * l
-              do j = 0, l - 2, 2
-                  j1 = j + l
-                  j2 = j1 + l
-                  j3 = j2 + l
-                  x0r = a(j, i) + a(j1, i)
-                  x0i = a(j + 1, i) + a(j1 + 1, i)
-                  x1r = a(j, i) - a(j1, i)
-                  x1i = a(j + 1, i) - a(j1 + 1, i)
-                  x2r = a(j2, i) + a(j3, i)
-                  x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                  x3r = a(j2, i) - a(j3, i)
-                  x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                  a(j, i) = x0r + x2r
-                  a(j + 1, i) = x0i + x2i
-                  a(j2, i) = x0r - x2r
-                  a(j2 + 1, i) = x0i - x2i
-                  a(j1, i) = x1r - x3i
-                  a(j1 + 1, i) = x1i + x3r
-                  a(j3, i) = x1r + x3i
-                  a(j3 + 1, i) = x1i - x3r
-              end do
-              if (m .lt. n) then
-                  wk1r = w(2)
-                  do j = m, l + m - 2, 2
-                      j1 = j + l
-                      j2 = j1 + l
-                      j3 = j2 + l
-                      x0r = a(j, i) + a(j1, i)
-                      x0i = a(j + 1, i) + a(j1 + 1, i)
-                      x1r = a(j, i) - a(j1, i)
-                      x1i = a(j + 1, i) - a(j1 + 1, i)
-                      x2r = a(j2, i) + a(j3, i)
-                      x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                      x3r = a(j2, i) - a(j3, i)
-                      x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                      a(j, i) = x0r + x2r
-                      a(j + 1, i) = x0i + x2i
-                      a(j2, i) = x2i - x0i
-                      a(j2 + 1, i) = x0r - x2r
-                      x0r = x1r - x3i
-                      x0i = x1i + x3r
-                      a(j1, i) = wk1r * (x0r - x0i)
-                      a(j1 + 1, i) = wk1r * (x0r + x0i)
-                      x0r = x3i + x1r
-                      x0i = x3r - x1i
-                      a(j3, i) = wk1r * (x0i - x0r)
-                      a(j3 + 1, i) = wk1r * (x0i + x0r)
-                  end do
-                  k1 = 1
-                  ks = -1
-                  do k = 2 * m, n - m, m
-                      k1 = k1 + 1
-                      ks = -ks
-                      wk1r = w(2 * k1)
-                      wk1i = w(2 * k1 + 1)
-                      wk2r = ks * w(k1)
-                      wk2i = w(k1 + ks)
-                      wk3r = wk1r - 2 * wk2i * wk1i
-                      wk3i = 2 * wk2i * wk1r - wk1i
-                      do j = k, l + k - 2, 2
-                          j1 = j + l
-                          j2 = j1 + l
-                          j3 = j2 + l
-                          x0r = a(j, i) + a(j1, i)
-                          x0i = a(j + 1, i) + a(j1 + 1, i)
-                          x1r = a(j, i) - a(j1, i)
-                          x1i = a(j + 1, i) - a(j1 + 1, i)
-                          x2r = a(j2, i) + a(j3, i)
-                          x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                          x3r = a(j2, i) - a(j3, i)
-                          x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                          a(j, i) = x0r + x2r
-                          a(j + 1, i) = x0i + x2i
-                          x0r = x0r - x2r
-                          x0i = x0i - x2i
-                          a(j2, i) = wk2r * x0r - wk2i * x0i
-                          a(j2 + 1, i) = wk2r * x0i + wk2i * x0r
-                          x0r = x1r - x3i
-                          x0i = x1i + x3r
-                          a(j1, i) = wk1r * x0r - wk1i * x0i
-                          a(j1 + 1, i) = wk1r * x0i + wk1i * x0r
-                          x0r = x1r + x3i
-                          x0i = x1i - x3r
-                          a(j3, i) = wk3r * x0r - wk3i * x0i
-                          a(j3 + 1, i) = wk3r * x0i + wk3i * x0r
-                      end do
-                  end do
-              end if
-              l = m
-          end do
-          if (l .lt. n) then
-              do j = 0, l - 2, 2
-                  j1 = j + l
-                  x0r = a(j, i) - a(j1, i)
-                  x0i = a(j + 1, i) - a(j1 + 1, i)
-                  a(j, i) = a(j, i) + a(j1, i)
-                  a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
-                  a(j1, i) = x0r
-                  a(j1 + 1, i) = x0i
-              end do
-          end if
-      end do
-      end
-!
-      subroutine cftbcol(n1max, n1, n, a, w)
-      integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
-      real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
-      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      l = 1
-      do while (2 * l .lt. n)
-          m = 4 * l
-          do j = 0, l - 1
-              j1 = j + l
-              j2 = j1 + l
-              j3 = j2 + l
-              do i = 0, n1 - 2, 2
-                  x0r = a(i, j) + a(i, j1)
-                  x0i = a(i + 1, j) + a(i + 1, j1)
-                  x1r = a(i, j) - a(i, j1)
-                  x1i = a(i + 1, j) - a(i + 1, j1)
-                  x2r = a(i, j2) + a(i, j3)
-                  x2i = a(i + 1, j2) + a(i + 1, j3)
-                  x3r = a(i, j2) - a(i, j3)
-                  x3i = a(i + 1, j2) - a(i + 1, j3)
-                  a(i, j) = x0r + x2r
-                  a(i + 1, j) = x0i + x2i
-                  a(i, j2) = x0r - x2r
-                  a(i + 1, j2) = x0i - x2i
-                  a(i, j1) = x1r - x3i
-                  a(i + 1, j1) = x1i + x3r
-                  a(i, j3) = x1r + x3i
-                  a(i + 1, j3) = x1i - x3r
-              end do
-          end do
-          if (m .lt. n) then
-              wk1r = w(2)
-              do j = m, l + m - 1
-                  j1 = j + l
-                  j2 = j1 + l
-                  j3 = j2 + l
-                  do i = 0, n1 - 2, 2
-                      x0r = a(i, j) + a(i, j1)
-                      x0i = a(i + 1, j) + a(i + 1, j1)
-                      x1r = a(i, j) - a(i, j1)
-                      x1i = a(i + 1, j) - a(i + 1, j1)
-                      x2r = a(i, j2) + a(i, j3)
-                      x2i = a(i + 1, j2) + a(i + 1, j3)
-                      x3r = a(i, j2) - a(i, j3)
-                      x3i = a(i + 1, j2) - a(i + 1, j3)
-                      a(i, j) = x0r + x2r
-                      a(i + 1, j) = x0i + x2i
-                      a(i, j2) = x2i - x0i
-                      a(i + 1, j2) = x0r - x2r
-                      x0r = x1r - x3i
-                      x0i = x1i + x3r
-                      a(i, j1) = wk1r * (x0r - x0i)
-                      a(i + 1, j1) = wk1r * (x0r + x0i)
-                      x0r = x3i + x1r
-                      x0i = x3r - x1i
-                      a(i, j3) = wk1r * (x0i - x0r)
-                      a(i + 1, j3) = wk1r * (x0i + x0r)
-                  end do
-              end do
-              k1 = 1
-              ks = -1
-              do k = 2 * m, n - m, m
-                  k1 = k1 + 1
-                  ks = -ks
-                  wk1r = w(2 * k1)
-                  wk1i = w(2 * k1 + 1)
-                  wk2r = ks * w(k1)
-                  wk2i = w(k1 + ks)
-                  wk3r = wk1r - 2 * wk2i * wk1i
-                  wk3i = 2 * wk2i * wk1r - wk1i
-                  do j = k, l + k - 1
-                      j1 = j + l
-                      j2 = j1 + l
-                      j3 = j2 + l
-                      do i = 0, n1 - 2, 2
-                          x0r = a(i, j) + a(i, j1)
-                          x0i = a(i + 1, j) + a(i + 1, j1)
-                          x1r = a(i, j) - a(i, j1)
-                          x1i = a(i + 1, j) - a(i + 1, j1)
-                          x2r = a(i, j2) + a(i, j3)
-                          x2i = a(i + 1, j2) + a(i + 1, j3)
-                          x3r = a(i, j2) - a(i, j3)
-                          x3i = a(i + 1, j2) - a(i + 1, j3)
-                          a(i, j) = x0r + x2r
-                          a(i + 1, j) = x0i + x2i
-                          x0r = x0r - x2r
-                          x0i = x0i - x2i
-                          a(i, j2) = wk2r * x0r - wk2i * x0i
-                          a(i + 1, j2) = wk2r * x0i + wk2i * x0r
-                          x0r = x1r - x3i
-                          x0i = x1i + x3r
-                          a(i, j1) = wk1r * x0r - wk1i * x0i
-                          a(i + 1, j1) = wk1r * x0i + wk1i * x0r
-                          x0r = x1r + x3i
-                          x0i = x1i - x3r
-                          a(i, j3) = wk3r * x0r - wk3i * x0i
-                          a(i + 1, j3) = wk3r * x0i + wk3i * x0r
-                      end do
-                  end do
-              end do
-          end if
-          l = m
-      end do
-      if (l .lt. n) then
-          do j = 0, l - 1
-              j1 = j + l
-              do i = 0, n1 - 2, 2
-                  x0r = a(i, j) - a(i, j1)
-                  x0i = a(i + 1, j) - a(i + 1, j1)
-                  a(i, j) = a(i, j) + a(i, j1)
-                  a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
-                  a(i, j1) = x0r
-                  a(i + 1, j1) = x0i
-              end do
-          end do
-      end if
-      end
-!
-      subroutine cftfrow(n1max, n, n2, a, w)
-      integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
-      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      do i = 0, n2 - 1
-          l = 2
-          do while (2 * l .lt. n)
-              m = 4 * l
-              do j = 0, l - 2, 2
-                  j1 = j + l
-                  j2 = j1 + l
-                  j3 = j2 + l
-                  x0r = a(j, i) + a(j1, i)
-                  x0i = a(j + 1, i) + a(j1 + 1, i)
-                  x1r = a(j, i) - a(j1, i)
-                  x1i = a(j + 1, i) - a(j1 + 1, i)
-                  x2r = a(j2, i) + a(j3, i)
-                  x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                  x3r = a(j2, i) - a(j3, i)
-                  x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                  a(j, i) = x0r + x2r
-                  a(j + 1, i) = x0i + x2i
-                  a(j2, i) = x0r - x2r
-                  a(j2 + 1, i) = x0i - x2i
-                  a(j1, i) = x1r + x3i
-                  a(j1 + 1, i) = x1i - x3r
-                  a(j3, i) = x1r - x3i
-                  a(j3 + 1, i) = x1i + x3r
-              end do
-              if (m .lt. n) then
-                  wk1r = w(2)
-                  do j = m, l + m - 2, 2
-                      j1 = j + l
-                      j2 = j1 + l
-                      j3 = j2 + l
-                      x0r = a(j, i) + a(j1, i)
-                      x0i = a(j + 1, i) + a(j1 + 1, i)
-                      x1r = a(j, i) - a(j1, i)
-                      x1i = a(j + 1, i) - a(j1 + 1, i)
-                      x2r = a(j2, i) + a(j3, i)
-                      x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                      x3r = a(j2, i) - a(j3, i)
-                      x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                      a(j, i) = x0r + x2r
-                      a(j + 1, i) = x0i + x2i
-                      a(j2, i) = x0i - x2i
-                      a(j2 + 1, i) = x2r - x0r
-                      x0r = x1r + x3i
-                      x0i = x1i - x3r
-                      a(j1, i) = wk1r * (x0i + x0r)
-                      a(j1 + 1, i) = wk1r * (x0i - x0r)
-                      x0r = x3i - x1r
-                      x0i = x3r + x1i
-                      a(j3, i) = wk1r * (x0r + x0i)
-                      a(j3 + 1, i) = wk1r * (x0r - x0i)
-                  end do
-                  k1 = 1
-                  ks = -1
-                  do k = 2 * m, n - m, m
-                      k1 = k1 + 1
-                      ks = -ks
-                      wk1r = w(2 * k1)
-                      wk1i = w(2 * k1 + 1)
-                      wk2r = ks * w(k1)
-                      wk2i = w(k1 + ks)
-                      wk3r = wk1r - 2 * wk2i * wk1i
-                      wk3i = 2 * wk2i * wk1r - wk1i
-                      do j = k, l + k - 2, 2
-                          j1 = j + l
-                          j2 = j1 + l
-                          j3 = j2 + l
-                          x0r = a(j, i) + a(j1, i)
-                          x0i = a(j + 1, i) + a(j1 + 1, i)
-                          x1r = a(j, i) - a(j1, i)
-                          x1i = a(j + 1, i) - a(j1 + 1, i)
-                          x2r = a(j2, i) + a(j3, i)
-                          x2i = a(j2 + 1, i) + a(j3 + 1, i)
-                          x3r = a(j2, i) - a(j3, i)
-                          x3i = a(j2 + 1, i) - a(j3 + 1, i)
-                          a(j, i) = x0r + x2r
-                          a(j + 1, i) = x0i + x2i
-                          x0r = x0r - x2r
-                          x0i = x0i - x2i
-                          a(j2, i) = wk2r * x0r + wk2i * x0i
-                          a(j2 + 1, i) = wk2r * x0i - wk2i * x0r
-                          x0r = x1r + x3i
-                          x0i = x1i - x3r
-                          a(j1, i) = wk1r * x0r + wk1i * x0i
-                          a(j1 + 1, i) = wk1r * x0i - wk1i * x0r
-                          x0r = x1r - x3i
-                          x0i = x1i + x3r
-                          a(j3, i) = wk3r * x0r + wk3i * x0i
-                          a(j3 + 1, i) = wk3r * x0i - wk3i * x0r
-                      end do
-                  end do
-              end if
-              l = m
-          end do
-          if (l .lt. n) then
-              do j = 0, l - 2, 2
-                  j1 = j + l
-                  x0r = a(j, i) - a(j1, i)
-                  x0i = a(j + 1, i) - a(j1 + 1, i)
-                  a(j, i) = a(j, i) + a(j1, i)
-                  a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
-                  a(j1, i) = x0r
-                  a(j1 + 1, i) = x0i
-              end do
-          end if
-      end do
-      end
-!
-      subroutine cftfcol(n1max, n1, n, a, w)
-      integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
-      real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
-      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      l = 1
-      do while (2 * l .lt. n)
-          m = 4 * l
-          do j = 0, l - 1
-              j1 = j + l
-              j2 = j1 + l
-              j3 = j2 + l
-              do i = 0, n1 - 2, 2
-                  x0r = a(i, j) + a(i, j1)
-                  x0i = a(i + 1, j) + a(i + 1, j1)
-                  x1r = a(i, j) - a(i, j1)
-                  x1i = a(i + 1, j) - a(i + 1, j1)
-                  x2r = a(i, j2) + a(i, j3)
-                  x2i = a(i + 1, j2) + a(i + 1, j3)
-                  x3r = a(i, j2) - a(i, j3)
-                  x3i = a(i + 1, j2) - a(i + 1, j3)
-                  a(i, j) = x0r + x2r
-                  a(i + 1, j) = x0i + x2i
-                  a(i, j2) = x0r - x2r
-                  a(i + 1, j2) = x0i - x2i
-                  a(i, j1) = x1r + x3i
-                  a(i + 1, j1) = x1i - x3r
-                  a(i, j3) = x1r - x3i
-                  a(i + 1, j3) = x1i + x3r
-              end do
-          end do
-          if (m .lt. n) then
-              wk1r = w(2)
-              do j = m, l + m - 1
-                  j1 = j + l
-                  j2 = j1 + l
-                  j3 = j2 + l
-                  do i = 0, n1 - 2, 2
-                      x0r = a(i, j) + a(i, j1)
-                      x0i = a(i + 1, j) + a(i + 1, j1)
-                      x1r = a(i, j) - a(i, j1)
-                      x1i = a(i + 1, j) - a(i + 1, j1)
-                      x2r = a(i, j2) + a(i, j3)
-                      x2i = a(i + 1, j2) + a(i + 1, j3)
-                      x3r = a(i, j2) - a(i, j3)
-                      x3i = a(i + 1, j2) - a(i + 1, j3)
-                      a(i, j) = x0r + x2r
-                      a(i + 1, j) = x0i + x2i
-                      a(i, j2) = x0i - x2i
-                      a(i + 1, j2) = x2r - x0r
-                      x0r = x1r + x3i
-                      x0i = x1i - x3r
-                      a(i, j1) = wk1r * (x0i + x0r)
-                      a(i + 1, j1) = wk1r * (x0i - x0r)
-                      x0r = x3i - x1r
-                      x0i = x3r + x1i
-                      a(i, j3) = wk1r * (x0r + x0i)
-                      a(i + 1, j3) = wk1r * (x0r - x0i)
-                  end do
-              end do
-              k1 = 1
-              ks = -1
-              do k = 2 * m, n - m, m
-                  k1 = k1 + 1
-                  ks = -ks
-                  wk1r = w(2 * k1)
-                  wk1i = w(2 * k1 + 1)
-                  wk2r = ks * w(k1)
-                  wk2i = w(k1 + ks)
-                  wk3r = wk1r - 2 * wk2i * wk1i
-                  wk3i = 2 * wk2i * wk1r - wk1i
-                  do j = k, l + k - 1
-                      j1 = j + l
-                      j2 = j1 + l
-                      j3 = j2 + l
-                      do i = 0, n1 - 2, 2
-                          x0r = a(i, j) + a(i, j1)
-                          x0i = a(i + 1, j) + a(i + 1, j1)
-                          x1r = a(i, j) - a(i, j1)
-                          x1i = a(i + 1, j) - a(i + 1, j1)
-                          x2r = a(i, j2) + a(i, j3)
-                          x2i = a(i + 1, j2) + a(i + 1, j3)
-                          x3r = a(i, j2) - a(i, j3)
-                          x3i = a(i + 1, j2) - a(i + 1, j3)
-                          a(i, j) = x0r + x2r
-                          a(i + 1, j) = x0i + x2i
-                          x0r = x0r - x2r
-                          x0i = x0i - x2i
-                          a(i, j2) = wk2r * x0r + wk2i * x0i
-                          a(i + 1, j2) = wk2r * x0i - wk2i * x0r
-                          x0r = x1r + x3i
-                          x0i = x1i - x3r
-                          a(i, j1) = wk1r * x0r + wk1i * x0i
-                          a(i + 1, j1) = wk1r * x0i - wk1i * x0r
-                          x0r = x1r - x3i
-                          x0i = x1i + x3r
-                          a(i, j3) = wk3r * x0r + wk3i * x0i
-                          a(i + 1, j3) = wk3r * x0i - wk3i * x0r
-                      end do
-                  end do
-              end do
-          end if
-          l = m
-      end do
-      if (l .lt. n) then
-          do j = 0, l - 1
-              j1 = j + l
-              do i = 0, n1 - 2, 2
-                  x0r = a(i, j) - a(i, j1)
-                  x0i = a(i + 1, j) - a(i + 1, j1)
-                  a(i, j) = a(i, j) + a(i, j1)
-                  a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
-                  a(i, j1) = x0r
-                  a(i + 1, j1) = x0i
-              end do
-          end do
-      end if
-      end
-!
-      subroutine rftbrow(n1max, n, n2, a, nc, c)
-      integer n1max, n, n2, nc, i, j, k, kk, ks
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    wkr, wki, xr, xi, yr, yi
-      ks = 4 * nc / n
-      do i = 0, n2 - 1
-          kk = 0
-          do k = n / 2 - 2, 2, -2
-              j = n - k
-              kk = kk + ks
-              wkr = 0.5d0 - c(kk)
-              wki = c(nc - kk)
-              xr = a(k, i) - a(j, i)
-              xi = a(k + 1, i) + a(j + 1, i)
-              yr = wkr * xr - wki * xi
-              yi = wkr * xi + wki * xr
-              a(k, i) = a(k, i) - yr
-              a(k + 1, i) = a(k + 1, i) - yi
-              a(j, i) = a(j, i) + yr
-              a(j + 1, i) = a(j + 1, i) - yi
-          end do
-      end do
-      end
-!
-      subroutine rftfrow(n1max, n, n2, a, nc, c)
-      integer n1max, n, n2, nc, i, j, k, kk, ks
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    wkr, wki, xr, xi, yr, yi
-      ks = 4 * nc / n
-      do i = 0, n2 - 1
-          kk = 0
-          do k = n / 2 - 2, 2, -2
-              j = n - k
-              kk = kk + ks
-              wkr = 0.5d0 - c(kk)
-              wki = c(nc - kk)
-              xr = a(k, i) - a(j, i)
-              xi = a(k + 1, i) + a(j + 1, i)
-              yr = wkr * xr + wki * xi
-              yi = wkr * xi - wki * xr
-              a(k, i) = a(k, i) - yr
-              a(k + 1, i) = a(k + 1, i) - yi
-              a(j, i) = a(j, i) + yr
-              a(j + 1, i) = a(j + 1, i) - yi
-          end do
-      end do
-      end
-!
-      subroutine dctbsub(n1max, n1, n2, a, nc, c)
-      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
-     &    k1, k2
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
-      ks1 = nc / n1
-      ks2 = nc / n2
-      n2h = n2 / 2
-      kk2 = ks2
-      do k2 = 1, n2h - 1
-          j2 = n2 - k2
-          w2r = 2 * c(kk2)
-          w2i = 2 * c(nc - kk2)
-          kk2 = kk2 + ks2
-          kk1 = ks1
-          do k1 = 2, n1 - 2, 2
-              x0r = w2r * c(kk1)
-              x0i = w2i * c(kk1)
-              x1r = w2r * c(nc - kk1)
-              x1i = w2i * c(nc - kk1)
-              wkr = x0r - x1i
-              wki = x0i + x1r
-              wji = x0r + x1i
-              wjr = x0i - x1r
-              kk1 = kk1 + ks1
-              x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
-              x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
-              x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
-              x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
-              a(k1, k2) = x0r + x1i
-              a(k1 + 1, k2) = x0i - x1r
-              a(k1, j2) = x1r + x0i
-              a(k1 + 1, j2) = x1i - x0r
-          end do
-          wkr = w2r * 0.5d0
-          wki = w2i * 0.5d0
-          wjr = w2r * c(kk1)
-          wji = w2i * c(kk1)
-          x0r = a(0, k2) + a(0, j2)
-          x0i = a(1, k2) - a(1, j2)
-          x1r = a(0, k2) - a(0, j2)
-          x1i = a(1, k2) + a(1, j2)
-          a(0, k2) = wkr * x0r - wki * x0i
-          a(1, k2) = wkr * x0i + wki * x0r
-          a(0, j2) = -wjr * x1r + wji * x1i
-          a(1, j2) = wjr * x1i + wji * x1r
-      end do
-      w2r = 2 * c(kk2)
-      kk1 = ks1
-      do k1 = 2, n1 - 2, 2
-          wkr = 2 * c(kk1)
-          wki = 2 * c(nc - kk1)
-          wjr = w2r * wkr
-          wji = w2r * wki
-          kk1 = kk1 + ks1
-          x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
-          a(k1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
-          a(k1 + 1, 0) = x0i
-          x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
-          a(k1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
-          a(k1 + 1, n2h) = x0i
-      end do
-      a(1, 0) = a(1, 0) * w2r
-      a(0, n2h) = a(0, n2h) * w2r
-      a(1, n2h) = a(1, n2h) * 0.5d0
-      end
-!
-      subroutine dctfsub(n1max, n1, n2, a, nc, c)
-      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
-     &    k1, k2
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
-      ks1 = nc / n1
-      ks2 = nc / n2
-      n2h = n2 / 2
-      kk2 = ks2
-      do k2 = 1, n2h - 1
-          j2 = n2 - k2
-          w2r = 2 * c(kk2)
-          w2i = 2 * c(nc - kk2)
-          kk2 = kk2 + ks2
-          kk1 = ks1
-          do k1 = 2, n1 - 2, 2
-              x0r = w2r * c(kk1)
-              x0i = w2i * c(kk1)
-              x1r = w2r * c(nc - kk1)
-              x1i = w2i * c(nc - kk1)
-              wkr = x0r - x1i
-              wki = x0i + x1r
-              wji = x0r + x1i
-              wjr = x0i - x1r
-              kk1 = kk1 + ks1
-              x0r = a(k1, k2) - a(k1 + 1, j2)
-              x0i = a(k1, j2) + a(k1 + 1, k2)
-              x1r = a(k1, j2) - a(k1 + 1, k2)
-              x1i = a(k1, k2) + a(k1 + 1, j2)
-              a(k1, k2) = wkr * x0r + wki * x0i
-              a(k1 + 1, k2) = wkr * x0i - wki * x0r
-              a(k1, j2) = wjr * x1r + wji * x1i
-              a(k1 + 1, j2) = wjr * x1i - wji * x1r
-          end do
-          x0r = 2 * c(kk1)
-          wjr = x0r * w2r
-          wji = x0r * w2i
-          x0r = w2r * a(0, k2) + w2i * a(1, k2)
-          x0i = w2r * a(1, k2) - w2i * a(0, k2)
-          x1r = -wjr * a(0, j2) + wji * a(1, j2)
-          x1i = wjr * a(1, j2) + wji * a(0, j2)
-          a(0, k2) = x0r + x1r
-          a(1, k2) = x1i + x0i
-          a(0, j2) = x0r - x1r
-          a(1, j2) = x1i - x0i
-      end do
-      w2r = 2 * c(kk2)
-      kk1 = ks1
-      do k1 = 2, n1 - 2, 2
-          wkr = 2 * c(kk1)
-          wki = 2 * c(nc - kk1)
-          wjr = w2r * wkr
-          wji = w2r * wki
-          kk1 = kk1 + ks1
-          x0i = wkr * a(k1 + 1, 0) - wki * a(k1, 0)
-          a(k1, 0) = wkr * a(k1, 0) + wki * a(k1 + 1, 0)
-          a(k1 + 1, 0) = x0i
-          x0i = wjr * a(k1 + 1, n2h) - wji * a(k1, n2h)
-          a(k1, n2h) = wjr * a(k1, n2h) + wji * a(k1 + 1, n2h)
-          a(k1 + 1, n2h) = x0i
-      end do
-      w2r = w2r * 2
-      a(0, 0) = a(0, 0) * 2
-      a(1, 0) = a(1, 0) * w2r
-      a(0, n2h) = a(0, n2h) * w2r
-      end
-!
-      subroutine dstbsub(n1max, n1, n2, a, nc, c)
-      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
-     &    k1, k2
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
-      ks1 = nc / n1
-      ks2 = nc / n2
-      n2h = n2 / 2
-      kk2 = ks2
-      do k2 = 1, n2h - 1
-          j2 = n2 - k2
-          w2r = 2 * c(kk2)
-          w2i = 2 * c(nc - kk2)
-          kk2 = kk2 + ks2
-          kk1 = ks1
-          do k1 = 2, n1 - 2, 2
-              x0r = w2r * c(kk1)
-              x0i = w2i * c(kk1)
-              x1r = w2r * c(nc - kk1)
-              x1i = w2i * c(nc - kk1)
-              wkr = x0r - x1i
-              wki = x0i + x1r
-              wji = x0r + x1i
-              wjr = x0i - x1r
-              kk1 = kk1 + ks1
-              x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
-              x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
-              x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
-              x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
-              a(k1, k2) = x1i - x0r
-              a(k1 + 1, k2) = x1r + x0i
-              a(k1, j2) = x0i - x1r
-              a(k1 + 1, j2) = x0r + x1i
-          end do
-          wkr = w2r * 0.5d0
-          wki = w2i * 0.5d0
-          wjr = w2r * c(kk1)
-          wji = w2i * c(kk1)
-          x0r = a(0, k2) + a(0, j2)
-          x0i = a(1, k2) - a(1, j2)
-          x1r = a(0, k2) - a(0, j2)
-          x1i = a(1, k2) + a(1, j2)
-          a(1, k2) = wkr * x0r - wki * x0i
-          a(0, k2) = wkr * x0i + wki * x0r
-          a(1, j2) = -wjr * x1r + wji * x1i
-          a(0, j2) = wjr * x1i + wji * x1r
-      end do
-      w2r = 2 * c(kk2)
-      kk1 = ks1
-      do k1 = 2, n1 - 2, 2
-          wkr = 2 * c(kk1)
-          wki = 2 * c(nc - kk1)
-          wjr = w2r * wkr
-          wji = w2r * wki
-          kk1 = kk1 + ks1
-          x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
-          a(k1 + 1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
-          a(k1, 0) = x0i
-          x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
-          a(k1 + 1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
-          a(k1, n2h) = x0i
-      end do
-      a(1, 0) = a(1, 0) * w2r
-      a(0, n2h) = a(0, n2h) * w2r
-      a(1, n2h) = a(1, n2h) * 0.5d0
-      end
-!
-      subroutine dstfsub(n1max, n1, n2, a, nc, c)
-      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
-     &    k1, k2
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
-     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
-      ks1 = nc / n1
-      ks2 = nc / n2
-      n2h = n2 / 2
-      kk2 = ks2
-      do k2 = 1, n2h - 1
-          j2 = n2 - k2
-          w2r = 2 * c(kk2)
-          w2i = 2 * c(nc - kk2)
-          kk2 = kk2 + ks2
-          kk1 = ks1
-          do k1 = 2, n1 - 2, 2
-              x0r = w2r * c(kk1)
-              x0i = w2i * c(kk1)
-              x1r = w2r * c(nc - kk1)
-              x1i = w2i * c(nc - kk1)
-              wkr = x0r - x1i
-              wki = x0i + x1r
-              wji = x0r + x1i
-              wjr = x0i - x1r
-              kk1 = kk1 + ks1
-              x0r = a(k1 + 1, j2) - a(k1, k2)
-              x0i = a(k1 + 1, k2) + a(k1, j2)
-              x1r = a(k1 + 1, k2) - a(k1, j2)
-              x1i = a(k1 + 1, j2) + a(k1, k2)
-              a(k1, k2) = wkr * x0r + wki * x0i
-              a(k1 + 1, k2) = wkr * x0i - wki * x0r
-              a(k1, j2) = wjr * x1r + wji * x1i
-              a(k1 + 1, j2) = wjr * x1i - wji * x1r
-          end do
-          x0r = 2 * c(kk1)
-          wjr = x0r * w2r
-          wji = x0r * w2i
-          x0r = w2r * a(1, k2) + w2i * a(0, k2)
-          x0i = w2r * a(0, k2) - w2i * a(1, k2)
-          x1r = -wjr * a(1, j2) + wji * a(0, j2)
-          x1i = wjr * a(0, j2) + wji * a(1, j2)
-          a(0, k2) = x0r + x1r
-          a(1, k2) = x1i + x0i
-          a(0, j2) = x0r - x1r
-          a(1, j2) = x1i - x0i
-      end do
-      w2r = 2 * c(kk2)
-      kk1 = ks1
-      do k1 = 2, n1 - 2, 2
-          wkr = 2 * c(kk1)
-          wki = 2 * c(nc - kk1)
-          wjr = w2r * wkr
-          wji = w2r * wki
-          kk1 = kk1 + ks1
-          x0i = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
-          a(k1, 0) = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
-          a(k1 + 1, 0) = x0i
-          x0i = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
-          a(k1, n2h) = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
-          a(k1 + 1, n2h) = x0i
-      end do
-      w2r = w2r * 2
-      a(0, 0) = a(0, 0) * 2
-      a(1, 0) = a(1, 0) * w2r
-      a(0, n2h) = a(0, n2h) * w2r
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg.c b/third_party/tensorflow_dependencies/fft2d/fftsg.c
deleted file mode 100644
index 43d7534..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg.c
+++ /dev/null
@@ -1,3314 +0,0 @@
-/*
-Fast Fourier/Cosine/Sine Transform
-    dimension   :one
-    data length :power of 2
-    decimation  :frequency
-    radix       :split-radix
-    data        :inplace
-    table       :use
-functions
-    cdft: Complex Discrete Fourier Transform
-    rdft: Real Discrete Fourier Transform
-    ddct: Discrete Cosine Transform
-    ddst: Discrete Sine Transform
-    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
-    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
-function prototypes
-    void cdft(int, int, double *, int *, double *);
-    void rdft(int, int, double *, int *, double *);
-    void ddct(int, int, double *, int *, double *);
-    void ddst(int, int, double *, int *, double *);
-    void dfct(int, double *, double *, int *, double *);
-    void dfst(int, double *, double *, int *, double *);
-macro definitions
-    USE_CDFT_PTHREADS : default=not defined
-        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
-        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
-    USE_CDFT_WINTHREADS : default=not defined
-        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
-        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
-
-
--------- Complex DFT (Discrete Fourier Transform) --------
-    [definition]
-        <case1>
-            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
-        <case2>
-            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
-        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            cdft(2*n, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            cdft(2*n, -1, a, ip, w);
-    [parameters]
-        2*n            :data length (int)
-                        n >= 1, n = power of 2
-        a[0...2*n-1]   :input/output data (double *)
-                        input data
-                            a[2*j] = Re(x[j]), 
-                            a[2*j+1] = Im(x[j]), 0<=j<n
-                        output data
-                            a[2*k] = Re(X[k]), 
-                            a[2*k+1] = Im(X[k]), 0<=k<n
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n/2-1]   :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            cdft(2*n, -1, a, ip, w);
-        is 
-            cdft(2*n, 1, a, ip, w);
-            for (j = 0; j <= 2 * n - 1; j++) {
-                a[j] *= 1.0 / n;
-            }
-        .
-
-
--------- Real DFT / Inverse of Real DFT --------
-    [definition]
-        <case1> RDFT
-            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
-            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
-        <case2> IRDFT (excluding scale)
-            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
-                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
-                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            rdft(n, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            rdft(n, -1, a, ip, w);
-    [parameters]
-        n              :data length (int)
-                        n >= 2, n = power of 2
-        a[0...n-1]     :input/output data (double *)
-                        <case1>
-                            output data
-                                a[2*k] = R[k], 0<=k<n/2
-                                a[2*k+1] = I[k], 0<k<n/2
-                                a[1] = R[n/2]
-                        <case2>
-                            input data
-                                a[2*j] = R[j], 0<=j<n/2
-                                a[2*j+1] = I[j], 0<j<n/2
-                                a[1] = R[n/2]
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n/2)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n/2-1]   :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            rdft(n, 1, a, ip, w);
-        is 
-            rdft(n, -1, a, ip, w);
-            for (j = 0; j <= n - 1; j++) {
-                a[j] *= 2.0 / n;
-            }
-        .
-
-
--------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> IDCT (excluding scale)
-            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
-        <case2> DCT
-            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddct(n, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddct(n, -1, a, ip, w);
-    [parameters]
-        n              :data length (int)
-                        n >= 2, n = power of 2
-        a[0...n-1]     :input/output data (double *)
-                        output data
-                            a[k] = C[k], 0<=k<n
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n/2)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n*5/4-1] :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddct(n, -1, a, ip, w);
-        is 
-            a[0] *= 0.5;
-            ddct(n, 1, a, ip, w);
-            for (j = 0; j <= n - 1; j++) {
-                a[j] *= 2.0 / n;
-            }
-        .
-
-
--------- DST (Discrete Sine Transform) / Inverse of DST --------
-    [definition]
-        <case1> IDST (excluding scale)
-            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
-        <case2> DST
-            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddst(n, 1, a, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddst(n, -1, a, ip, w);
-    [parameters]
-        n              :data length (int)
-                        n >= 2, n = power of 2
-        a[0...n-1]     :input/output data (double *)
-                        <case1>
-                            input data
-                                a[j] = A[j], 0<j<n
-                                a[0] = A[n]
-                            output data
-                                a[k] = S[k], 0<=k<n
-                        <case2>
-                            output data
-                                a[k] = S[k], 0<k<n
-                                a[0] = S[n]
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n/2)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n*5/4-1] :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddst(n, -1, a, ip, w);
-        is 
-            a[0] *= 0.5;
-            ddst(n, 1, a, ip, w);
-            for (j = 0; j <= n - 1; j++) {
-                a[j] *= 2.0 / n;
-            }
-        .
-
-
--------- Cosine Transform of RDFT (Real Symmetric DFT) --------
-    [definition]
-        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
-    [usage]
-        ip[0] = 0; // first time only
-        dfct(n, a, t, ip, w);
-    [parameters]
-        n              :data length - 1 (int)
-                        n >= 2, n = power of 2
-        a[0...n]       :input/output data (double *)
-                        output data
-                            a[k] = C[k], 0<=k<=n
-        t[0...n/2]     :work area (double *)
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n/4)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n*5/8-1] :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            a[0] *= 0.5;
-            a[n] *= 0.5;
-            dfct(n, a, t, ip, w);
-        is 
-            a[0] *= 0.5;
-            a[n] *= 0.5;
-            dfct(n, a, t, ip, w);
-            for (j = 0; j <= n; j++) {
-                a[j] *= 2.0 / n;
-            }
-        .
-
-
--------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
-    [definition]
-        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
-    [usage]
-        ip[0] = 0; // first time only
-        dfst(n, a, t, ip, w);
-    [parameters]
-        n              :data length + 1 (int)
-                        n >= 2, n = power of 2
-        a[0...n-1]     :input/output data (double *)
-                        output data
-                            a[k] = S[k], 0<k<n
-                        (a[0] is used for work area)
-        t[0...n/2-1]   :work area (double *)
-        ip[0...*]      :work area for bit reversal (int *)
-                        length of ip >= 2+sqrt(n/4)
-                        strictly, 
-                        length of ip >= 
-                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
-                        ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...n*5/8-1] :cos/sin table (double *)
-                        w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            dfst(n, a, t, ip, w);
-        is 
-            dfst(n, a, t, ip, w);
-            for (j = 1; j <= n - 1; j++) {
-                a[j] *= 2.0 / n;
-            }
-        .
-
-
-Appendix :
-    The cos/sin table is recalculated when the larger table required.
-    w[] and ip[] are compatible with all routines.
-*/
-
-
-void cdft(int n, int isgn, double *a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void cftbsub(int n, double *a, int *ip, int nw, double *w);
-    int nw;
-    
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    if (isgn >= 0) {
-        cftfsub(n, a, ip, nw, w);
-    } else {
-        cftbsub(n, a, ip, nw, w);
-    }
-}
-
-
-void rdft(int n, int isgn, double *a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void cftbsub(int n, double *a, int *ip, int nw, double *w);
-    void rftfsub(int n, double *a, int nc, double *c);
-    void rftbsub(int n, double *a, int nc, double *c);
-    int nw, nc;
-    double xi;
-    
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > (nc << 2)) {
-        nc = n >> 2;
-        makect(nc, ip, w + nw);
-    }
-    if (isgn >= 0) {
-        if (n > 4) {
-            cftfsub(n, a, ip, nw, w);
-            rftfsub(n, a, nc, w + nw);
-        } else if (n == 4) {
-            cftfsub(n, a, ip, nw, w);
-        }
-        xi = a[0] - a[1];
-        a[0] += a[1];
-        a[1] = xi;
-    } else {
-        a[1] = 0.5 * (a[0] - a[1]);
-        a[0] -= a[1];
-        if (n > 4) {
-            rftbsub(n, a, nc, w + nw);
-            cftbsub(n, a, ip, nw, w);
-        } else if (n == 4) {
-            cftbsub(n, a, ip, nw, w);
-        }
-    }
-}
-
-
-void ddct(int n, int isgn, double *a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void cftbsub(int n, double *a, int *ip, int nw, double *w);
-    void rftfsub(int n, double *a, int nc, double *c);
-    void rftbsub(int n, double *a, int nc, double *c);
-    void dctsub(int n, double *a, int nc, double *c);
-    int j, nw, nc;
-    double xr;
-    
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    if (isgn < 0) {
-        xr = a[n - 1];
-        for (j = n - 2; j >= 2; j -= 2) {
-            a[j + 1] = a[j] - a[j - 1];
-            a[j] += a[j - 1];
-        }
-        a[1] = a[0] - xr;
-        a[0] += xr;
-        if (n > 4) {
-            rftbsub(n, a, nc, w + nw);
-            cftbsub(n, a, ip, nw, w);
-        } else if (n == 4) {
-            cftbsub(n, a, ip, nw, w);
-        }
-    }
-    dctsub(n, a, nc, w + nw);
-    if (isgn >= 0) {
-        if (n > 4) {
-            cftfsub(n, a, ip, nw, w);
-            rftfsub(n, a, nc, w + nw);
-        } else if (n == 4) {
-            cftfsub(n, a, ip, nw, w);
-        }
-        xr = a[0] - a[1];
-        a[0] += a[1];
-        for (j = 2; j < n; j += 2) {
-            a[j - 1] = a[j] - a[j + 1];
-            a[j] += a[j + 1];
-        }
-        a[n - 1] = xr;
-    }
-}
-
-
-void ddst(int n, int isgn, double *a, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void cftbsub(int n, double *a, int *ip, int nw, double *w);
-    void rftfsub(int n, double *a, int nc, double *c);
-    void rftbsub(int n, double *a, int nc, double *c);
-    void dstsub(int n, double *a, int nc, double *c);
-    int j, nw, nc;
-    double xr;
-    
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    if (isgn < 0) {
-        xr = a[n - 1];
-        for (j = n - 2; j >= 2; j -= 2) {
-            a[j + 1] = -a[j] - a[j - 1];
-            a[j] -= a[j - 1];
-        }
-        a[1] = a[0] + xr;
-        a[0] -= xr;
-        if (n > 4) {
-            rftbsub(n, a, nc, w + nw);
-            cftbsub(n, a, ip, nw, w);
-        } else if (n == 4) {
-            cftbsub(n, a, ip, nw, w);
-        }
-    }
-    dstsub(n, a, nc, w + nw);
-    if (isgn >= 0) {
-        if (n > 4) {
-            cftfsub(n, a, ip, nw, w);
-            rftfsub(n, a, nc, w + nw);
-        } else if (n == 4) {
-            cftfsub(n, a, ip, nw, w);
-        }
-        xr = a[0] - a[1];
-        a[0] += a[1];
-        for (j = 2; j < n; j += 2) {
-            a[j - 1] = -a[j] - a[j + 1];
-            a[j] -= a[j + 1];
-        }
-        a[n - 1] = -xr;
-    }
-}
-
-
-void dfct(int n, double *a, double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void rftfsub(int n, double *a, int nc, double *c);
-    void dctsub(int n, double *a, int nc, double *c);
-    int j, k, l, m, mh, nw, nc;
-    double xr, xi, yr, yi;
-    
-    nw = ip[0];
-    if (n > (nw << 3)) {
-        nw = n >> 3;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > (nc << 1)) {
-        nc = n >> 1;
-        makect(nc, ip, w + nw);
-    }
-    m = n >> 1;
-    yi = a[m];
-    xi = a[0] + a[n];
-    a[0] -= a[n];
-    t[0] = xi - yi;
-    t[m] = xi + yi;
-    if (n > 2) {
-        mh = m >> 1;
-        for (j = 1; j < mh; j++) {
-            k = m - j;
-            xr = a[j] - a[n - j];
-            xi = a[j] + a[n - j];
-            yr = a[k] - a[n - k];
-            yi = a[k] + a[n - k];
-            a[j] = xr;
-            a[k] = yr;
-            t[j] = xi - yi;
-            t[k] = xi + yi;
-        }
-        t[mh] = a[mh] + a[n - mh];
-        a[mh] -= a[n - mh];
-        dctsub(m, a, nc, w + nw);
-        if (m > 4) {
-            cftfsub(m, a, ip, nw, w);
-            rftfsub(m, a, nc, w + nw);
-        } else if (m == 4) {
-            cftfsub(m, a, ip, nw, w);
-        }
-        a[n - 1] = a[0] - a[1];
-        a[1] = a[0] + a[1];
-        for (j = m - 2; j >= 2; j -= 2) {
-            a[2 * j + 1] = a[j] + a[j + 1];
-            a[2 * j - 1] = a[j] - a[j + 1];
-        }
-        l = 2;
-        m = mh;
-        while (m >= 2) {
-            dctsub(m, t, nc, w + nw);
-            if (m > 4) {
-                cftfsub(m, t, ip, nw, w);
-                rftfsub(m, t, nc, w + nw);
-            } else if (m == 4) {
-                cftfsub(m, t, ip, nw, w);
-            }
-            a[n - l] = t[0] - t[1];
-            a[l] = t[0] + t[1];
-            k = 0;
-            for (j = 2; j < m; j += 2) {
-                k += l << 2;
-                a[k - l] = t[j] - t[j + 1];
-                a[k + l] = t[j] + t[j + 1];
-            }
-            l <<= 1;
-            mh = m >> 1;
-            for (j = 0; j < mh; j++) {
-                k = m - j;
-                t[j] = t[m + k] - t[m + j];
-                t[k] = t[m + k] + t[m + j];
-            }
-            t[mh] = t[m + mh];
-            m = mh;
-        }
-        a[l] = t[0];
-        a[n] = t[2] - t[1];
-        a[0] = t[2] + t[1];
-    } else {
-        a[1] = a[0];
-        a[2] = t[0];
-        a[0] = t[1];
-    }
-}
-
-
-void dfst(int n, double *a, double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void cftfsub(int n, double *a, int *ip, int nw, double *w);
-    void rftfsub(int n, double *a, int nc, double *c);
-    void dstsub(int n, double *a, int nc, double *c);
-    int j, k, l, m, mh, nw, nc;
-    double xr, xi, yr, yi;
-    
-    nw = ip[0];
-    if (n > (nw << 3)) {
-        nw = n >> 3;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > (nc << 1)) {
-        nc = n >> 1;
-        makect(nc, ip, w + nw);
-    }
-    if (n > 2) {
-        m = n >> 1;
-        mh = m >> 1;
-        for (j = 1; j < mh; j++) {
-            k = m - j;
-            xr = a[j] + a[n - j];
-            xi = a[j] - a[n - j];
-            yr = a[k] + a[n - k];
-            yi = a[k] - a[n - k];
-            a[j] = xr;
-            a[k] = yr;
-            t[j] = xi + yi;
-            t[k] = xi - yi;
-        }
-        t[0] = a[mh] - a[n - mh];
-        a[mh] += a[n - mh];
-        a[0] = a[m];
-        dstsub(m, a, nc, w + nw);
-        if (m > 4) {
-            cftfsub(m, a, ip, nw, w);
-            rftfsub(m, a, nc, w + nw);
-        } else if (m == 4) {
-            cftfsub(m, a, ip, nw, w);
-        }
-        a[n - 1] = a[1] - a[0];
-        a[1] = a[0] + a[1];
-        for (j = m - 2; j >= 2; j -= 2) {
-            a[2 * j + 1] = a[j] - a[j + 1];
-            a[2 * j - 1] = -a[j] - a[j + 1];
-        }
-        l = 2;
-        m = mh;
-        while (m >= 2) {
-            dstsub(m, t, nc, w + nw);
-            if (m > 4) {
-                cftfsub(m, t, ip, nw, w);
-                rftfsub(m, t, nc, w + nw);
-            } else if (m == 4) {
-                cftfsub(m, t, ip, nw, w);
-            }
-            a[n - l] = t[1] - t[0];
-            a[l] = t[0] + t[1];
-            k = 0;
-            for (j = 2; j < m; j += 2) {
-                k += l << 2;
-                a[k - l] = -t[j] - t[j + 1];
-                a[k + l] = t[j] - t[j + 1];
-            }
-            l <<= 1;
-            mh = m >> 1;
-            for (j = 1; j < mh; j++) {
-                k = m - j;
-                t[j] = t[m + k] + t[m + j];
-                t[k] = t[m + k] - t[m + j];
-            }
-            t[0] = t[m + mh];
-            m = mh;
-        }
-        a[l] = t[0];
-    }
-    a[0] = 0;
-}
-
-
-/* -------- initializing routines -------- */
-
-
-#include <math.h>
-
-void makewt(int nw, int *ip, double *w)
-{
-    void makeipt(int nw, int *ip);
-    int j, nwh, nw0, nw1;
-    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
-    
-    ip[0] = nw;
-    ip[1] = 1;
-    if (nw > 2) {
-        nwh = nw >> 1;
-        delta = atan(1.0) / nwh;
-        wn4r = cos(delta * nwh);
-        w[0] = 1;
-        w[1] = wn4r;
-        if (nwh == 4) {
-            w[2] = cos(delta * 2);
-            w[3] = sin(delta * 2);
-        } else if (nwh > 4) {
-            makeipt(nw, ip);
-            w[2] = 0.5 / cos(delta * 2);
-            w[3] = 0.5 / cos(delta * 6);
-            for (j = 4; j < nwh; j += 4) {
-                w[j] = cos(delta * j);
-                w[j + 1] = sin(delta * j);
-                w[j + 2] = cos(3 * delta * j);
-                w[j + 3] = -sin(3 * delta * j);
-            }
-        }
-        nw0 = 0;
-        while (nwh > 2) {
-            nw1 = nw0 + nwh;
-            nwh >>= 1;
-            w[nw1] = 1;
-            w[nw1 + 1] = wn4r;
-            if (nwh == 4) {
-                wk1r = w[nw0 + 4];
-                wk1i = w[nw0 + 5];
-                w[nw1 + 2] = wk1r;
-                w[nw1 + 3] = wk1i;
-            } else if (nwh > 4) {
-                wk1r = w[nw0 + 4];
-                wk3r = w[nw0 + 6];
-                w[nw1 + 2] = 0.5 / wk1r;
-                w[nw1 + 3] = 0.5 / wk3r;
-                for (j = 4; j < nwh; j += 4) {
-                    wk1r = w[nw0 + 2 * j];
-                    wk1i = w[nw0 + 2 * j + 1];
-                    wk3r = w[nw0 + 2 * j + 2];
-                    wk3i = w[nw0 + 2 * j + 3];
-                    w[nw1 + j] = wk1r;
-                    w[nw1 + j + 1] = wk1i;
-                    w[nw1 + j + 2] = wk3r;
-                    w[nw1 + j + 3] = wk3i;
-                }
-            }
-            nw0 = nw1;
-        }
-    }
-}
-
-
-void makeipt(int nw, int *ip)
-{
-    int j, l, m, m2, p, q;
-    
-    ip[2] = 0;
-    ip[3] = 16;
-    m = 2;
-    for (l = nw; l > 32; l >>= 2) {
-        m2 = m << 1;
-        q = m2 << 3;
-        for (j = m; j < m2; j++) {
-            p = ip[j] << 2;
-            ip[m + j] = p;
-            ip[m2 + j] = p + q;
-        }
-        m = m2;
-    }
-}
-
-
-void makect(int nc, int *ip, double *c)
-{
-    int j, nch;
-    double delta;
-    
-    ip[1] = nc;
-    if (nc > 1) {
-        nch = nc >> 1;
-        delta = atan(1.0) / nch;
-        c[0] = cos(delta * nch);
-        c[nch] = 0.5 * c[0];
-        for (j = 1; j < nch; j++) {
-            c[j] = 0.5 * cos(delta * j);
-            c[nc - j] = 0.5 * sin(delta * j);
-        }
-    }
-}
-
-
-/* -------- child routines -------- */
-
-
-#ifdef USE_CDFT_PTHREADS
-#define USE_CDFT_THREADS
-#ifndef CDFT_THREADS_BEGIN_N
-#define CDFT_THREADS_BEGIN_N 8192
-#endif
-#ifndef CDFT_4THREADS_BEGIN_N
-#define CDFT_4THREADS_BEGIN_N 65536
-#endif
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#define cdft_thread_t pthread_t
-#define cdft_thread_create(thp,func,argp) { \
-    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
-        fprintf(stderr, "cdft thread error\n"); \
-        exit(1); \
-    } \
-}
-#define cdft_thread_wait(th) { \
-    if (pthread_join(th, NULL) != 0) { \
-        fprintf(stderr, "cdft thread error\n"); \
-        exit(1); \
-    } \
-}
-#endif /* USE_CDFT_PTHREADS */
-
-
-#ifdef USE_CDFT_WINTHREADS
-#define USE_CDFT_THREADS
-#ifndef CDFT_THREADS_BEGIN_N
-#define CDFT_THREADS_BEGIN_N 32768
-#endif
-#ifndef CDFT_4THREADS_BEGIN_N
-#define CDFT_4THREADS_BEGIN_N 524288
-#endif
-#include <windows.h>
-#include <stdio.h>
-#include <stdlib.h>
-#define cdft_thread_t HANDLE
-#define cdft_thread_create(thp,func,argp) { \
-    DWORD thid; \
-    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
-    if (*(thp) == 0) { \
-        fprintf(stderr, "cdft thread error\n"); \
-        exit(1); \
-    } \
-}
-#define cdft_thread_wait(th) { \
-    WaitForSingleObject(th, INFINITE); \
-    CloseHandle(th); \
-}
-#endif /* USE_CDFT_WINTHREADS */
-
-
-void cftfsub(int n, double *a, int *ip, int nw, double *w)
-{
-    void bitrv2(int n, int *ip, double *a);
-    void bitrv216(double *a);
-    void bitrv208(double *a);
-    void cftf1st(int n, double *a, double *w);
-    void cftrec4(int n, double *a, int nw, double *w);
-    void cftleaf(int n, int isplt, double *a, int nw, double *w);
-    void cftfx41(int n, double *a, int nw, double *w);
-    void cftf161(double *a, double *w);
-    void cftf081(double *a, double *w);
-    void cftf040(double *a);
-    void cftx020(double *a);
-#ifdef USE_CDFT_THREADS
-    void cftrec4_th(int n, double *a, int nw, double *w);
-#endif /* USE_CDFT_THREADS */
-    
-    if (n > 8) {
-        if (n > 32) {
-            cftf1st(n, a, &w[nw - (n >> 2)]);
-#ifdef USE_CDFT_THREADS
-            if (n > CDFT_THREADS_BEGIN_N) {
-                cftrec4_th(n, a, nw, w);
-            } else 
-#endif /* USE_CDFT_THREADS */
-            if (n > 512) {
-                cftrec4(n, a, nw, w);
-            } else if (n > 128) {
-                cftleaf(n, 1, a, nw, w);
-            } else {
-                cftfx41(n, a, nw, w);
-            }
-            bitrv2(n, ip, a);
-        } else if (n == 32) {
-            cftf161(a, &w[nw - 8]);
-            bitrv216(a);
-        } else {
-            cftf081(a, w);
-            bitrv208(a);
-        }
-    } else if (n == 8) {
-        cftf040(a);
-    } else if (n == 4) {
-        cftx020(a);
-    }
-}
-
-
-void cftbsub(int n, double *a, int *ip, int nw, double *w)
-{
-    void bitrv2conj(int n, int *ip, double *a);
-    void bitrv216neg(double *a);
-    void bitrv208neg(double *a);
-    void cftb1st(int n, double *a, double *w);
-    void cftrec4(int n, double *a, int nw, double *w);
-    void cftleaf(int n, int isplt, double *a, int nw, double *w);
-    void cftfx41(int n, double *a, int nw, double *w);
-    void cftf161(double *a, double *w);
-    void cftf081(double *a, double *w);
-    void cftb040(double *a);
-    void cftx020(double *a);
-#ifdef USE_CDFT_THREADS
-    void cftrec4_th(int n, double *a, int nw, double *w);
-#endif /* USE_CDFT_THREADS */
-    
-    if (n > 8) {
-        if (n > 32) {
-            cftb1st(n, a, &w[nw - (n >> 2)]);
-#ifdef USE_CDFT_THREADS
-            if (n > CDFT_THREADS_BEGIN_N) {
-                cftrec4_th(n, a, nw, w);
-            } else 
-#endif /* USE_CDFT_THREADS */
-            if (n > 512) {
-                cftrec4(n, a, nw, w);
-            } else if (n > 128) {
-                cftleaf(n, 1, a, nw, w);
-            } else {
-                cftfx41(n, a, nw, w);
-            }
-            bitrv2conj(n, ip, a);
-        } else if (n == 32) {
-            cftf161(a, &w[nw - 8]);
-            bitrv216neg(a);
-        } else {
-            cftf081(a, w);
-            bitrv208neg(a);
-        }
-    } else if (n == 8) {
-        cftb040(a);
-    } else if (n == 4) {
-        cftx020(a);
-    }
-}
-
-
-void bitrv2(int n, int *ip, double *a)
-{
-    int j, j1, k, k1, l, m, nh, nm;
-    double xr, xi, yr, yi;
-    
-    m = 1;
-    for (l = n >> 2; l > 8; l >>= 2) {
-        m <<= 1;
-    }
-    nh = n >> 1;
-    nm = 4 * m;
-    if (l == 8) {
-        for (k = 0; k < m; k++) {
-            for (j = 0; j < k; j++) {
-                j1 = 4 * j + 2 * ip[m + k];
-                k1 = 4 * k + 2 * ip[m + j];
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nh;
-                k1 += 2;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += 2;
-                k1 += nh;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nh;
-                k1 -= 2;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-            k1 = 4 * k + 2 * ip[m + k];
-            j1 = k1 + 2;
-            k1 += nh;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nm;
-            k1 += 2 * nm;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nm;
-            k1 -= nm;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 -= 2;
-            k1 -= nh;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nh + 2;
-            k1 += nh + 2;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 -= nh - nm;
-            k1 += 2 * nm - 2;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-        }
-    } else {
-        for (k = 0; k < m; k++) {
-            for (j = 0; j < k; j++) {
-                j1 = 4 * j + ip[m + k];
-                k1 = 4 * k + ip[m + j];
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nh;
-                k1 += 2;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += 2;
-                k1 += nh;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nh;
-                k1 -= 2;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = a[j1 + 1];
-                yr = a[k1];
-                yi = a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-            k1 = 4 * k + ip[m + k];
-            j1 = k1 + 2;
-            k1 += nh;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nm;
-            k1 += nm;
-            xr = a[j1];
-            xi = a[j1 + 1];
-            yr = a[k1];
-            yi = a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-        }
-    }
-}
-
-
-void bitrv2conj(int n, int *ip, double *a)
-{
-    int j, j1, k, k1, l, m, nh, nm;
-    double xr, xi, yr, yi;
-    
-    m = 1;
-    for (l = n >> 2; l > 8; l >>= 2) {
-        m <<= 1;
-    }
-    nh = n >> 1;
-    nm = 4 * m;
-    if (l == 8) {
-        for (k = 0; k < m; k++) {
-            for (j = 0; j < k; j++) {
-                j1 = 4 * j + 2 * ip[m + k];
-                k1 = 4 * k + 2 * ip[m + j];
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nh;
-                k1 += 2;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += 2;
-                k1 += nh;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nh;
-                k1 -= 2;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= 2 * nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-            k1 = 4 * k + 2 * ip[m + k];
-            j1 = k1 + 2;
-            k1 += nh;
-            a[j1 - 1] = -a[j1 - 1];
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            a[k1 + 3] = -a[k1 + 3];
-            j1 += nm;
-            k1 += 2 * nm;
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nm;
-            k1 -= nm;
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 -= 2;
-            k1 -= nh;
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 += nh + 2;
-            k1 += nh + 2;
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            j1 -= nh - nm;
-            k1 += 2 * nm - 2;
-            a[j1 - 1] = -a[j1 - 1];
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            a[k1 + 3] = -a[k1 + 3];
-        }
-    } else {
-        for (k = 0; k < m; k++) {
-            for (j = 0; j < k; j++) {
-                j1 = 4 * j + ip[m + k];
-                k1 = 4 * k + ip[m + j];
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nh;
-                k1 += 2;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += 2;
-                k1 += nh;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 += nm;
-                k1 += nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nh;
-                k1 -= 2;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-                j1 -= nm;
-                k1 -= nm;
-                xr = a[j1];
-                xi = -a[j1 + 1];
-                yr = a[k1];
-                yi = -a[k1 + 1];
-                a[j1] = yr;
-                a[j1 + 1] = yi;
-                a[k1] = xr;
-                a[k1 + 1] = xi;
-            }
-            k1 = 4 * k + ip[m + k];
-            j1 = k1 + 2;
-            k1 += nh;
-            a[j1 - 1] = -a[j1 - 1];
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            a[k1 + 3] = -a[k1 + 3];
-            j1 += nm;
-            k1 += nm;
-            a[j1 - 1] = -a[j1 - 1];
-            xr = a[j1];
-            xi = -a[j1 + 1];
-            yr = a[k1];
-            yi = -a[k1 + 1];
-            a[j1] = yr;
-            a[j1 + 1] = yi;
-            a[k1] = xr;
-            a[k1 + 1] = xi;
-            a[k1 + 3] = -a[k1 + 3];
-        }
-    }
-}
-
-
-void bitrv216(double *a)
-{
-    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
-        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, 
-        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
-    
-    x1r = a[2];
-    x1i = a[3];
-    x2r = a[4];
-    x2i = a[5];
-    x3r = a[6];
-    x3i = a[7];
-    x4r = a[8];
-    x4i = a[9];
-    x5r = a[10];
-    x5i = a[11];
-    x7r = a[14];
-    x7i = a[15];
-    x8r = a[16];
-    x8i = a[17];
-    x10r = a[20];
-    x10i = a[21];
-    x11r = a[22];
-    x11i = a[23];
-    x12r = a[24];
-    x12i = a[25];
-    x13r = a[26];
-    x13i = a[27];
-    x14r = a[28];
-    x14i = a[29];
-    a[2] = x8r;
-    a[3] = x8i;
-    a[4] = x4r;
-    a[5] = x4i;
-    a[6] = x12r;
-    a[7] = x12i;
-    a[8] = x2r;
-    a[9] = x2i;
-    a[10] = x10r;
-    a[11] = x10i;
-    a[14] = x14r;
-    a[15] = x14i;
-    a[16] = x1r;
-    a[17] = x1i;
-    a[20] = x5r;
-    a[21] = x5i;
-    a[22] = x13r;
-    a[23] = x13i;
-    a[24] = x3r;
-    a[25] = x3i;
-    a[26] = x11r;
-    a[27] = x11i;
-    a[28] = x7r;
-    a[29] = x7i;
-}
-
-
-void bitrv216neg(double *a)
-{
-    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
-        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, 
-        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, 
-        x13r, x13i, x14r, x14i, x15r, x15i;
-    
-    x1r = a[2];
-    x1i = a[3];
-    x2r = a[4];
-    x2i = a[5];
-    x3r = a[6];
-    x3i = a[7];
-    x4r = a[8];
-    x4i = a[9];
-    x5r = a[10];
-    x5i = a[11];
-    x6r = a[12];
-    x6i = a[13];
-    x7r = a[14];
-    x7i = a[15];
-    x8r = a[16];
-    x8i = a[17];
-    x9r = a[18];
-    x9i = a[19];
-    x10r = a[20];
-    x10i = a[21];
-    x11r = a[22];
-    x11i = a[23];
-    x12r = a[24];
-    x12i = a[25];
-    x13r = a[26];
-    x13i = a[27];
-    x14r = a[28];
-    x14i = a[29];
-    x15r = a[30];
-    x15i = a[31];
-    a[2] = x15r;
-    a[3] = x15i;
-    a[4] = x7r;
-    a[5] = x7i;
-    a[6] = x11r;
-    a[7] = x11i;
-    a[8] = x3r;
-    a[9] = x3i;
-    a[10] = x13r;
-    a[11] = x13i;
-    a[12] = x5r;
-    a[13] = x5i;
-    a[14] = x9r;
-    a[15] = x9i;
-    a[16] = x1r;
-    a[17] = x1i;
-    a[18] = x14r;
-    a[19] = x14i;
-    a[20] = x6r;
-    a[21] = x6i;
-    a[22] = x10r;
-    a[23] = x10i;
-    a[24] = x2r;
-    a[25] = x2i;
-    a[26] = x12r;
-    a[27] = x12i;
-    a[28] = x4r;
-    a[29] = x4i;
-    a[30] = x8r;
-    a[31] = x8i;
-}
-
-
-void bitrv208(double *a)
-{
-    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
-    
-    x1r = a[2];
-    x1i = a[3];
-    x3r = a[6];
-    x3i = a[7];
-    x4r = a[8];
-    x4i = a[9];
-    x6r = a[12];
-    x6i = a[13];
-    a[2] = x4r;
-    a[3] = x4i;
-    a[6] = x6r;
-    a[7] = x6i;
-    a[8] = x1r;
-    a[9] = x1i;
-    a[12] = x3r;
-    a[13] = x3i;
-}
-
-
-void bitrv208neg(double *a)
-{
-    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
-        x5r, x5i, x6r, x6i, x7r, x7i;
-    
-    x1r = a[2];
-    x1i = a[3];
-    x2r = a[4];
-    x2i = a[5];
-    x3r = a[6];
-    x3i = a[7];
-    x4r = a[8];
-    x4i = a[9];
-    x5r = a[10];
-    x5i = a[11];
-    x6r = a[12];
-    x6i = a[13];
-    x7r = a[14];
-    x7i = a[15];
-    a[2] = x7r;
-    a[3] = x7i;
-    a[4] = x3r;
-    a[5] = x3i;
-    a[6] = x5r;
-    a[7] = x5i;
-    a[8] = x1r;
-    a[9] = x1i;
-    a[10] = x6r;
-    a[11] = x6i;
-    a[12] = x2r;
-    a[13] = x2i;
-    a[14] = x4r;
-    a[15] = x4i;
-}
-
-
-void cftf1st(int n, double *a, double *w)
-{
-    int j, j0, j1, j2, j3, k, m, mh;
-    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
-        wd1r, wd1i, wd3r, wd3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
-    
-    mh = n >> 3;
-    m = 2 * mh;
-    j1 = m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[0] + a[j2];
-    x0i = a[1] + a[j2 + 1];
-    x1r = a[0] - a[j2];
-    x1i = a[1] - a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[0] = x0r + x2r;
-    a[1] = x0i + x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i - x2i;
-    a[j2] = x1r - x3i;
-    a[j2 + 1] = x1i + x3r;
-    a[j3] = x1r + x3i;
-    a[j3 + 1] = x1i - x3r;
-    wn4r = w[1];
-    csc1 = w[2];
-    csc3 = w[3];
-    wd1r = 1;
-    wd1i = 0;
-    wd3r = 1;
-    wd3i = 0;
-    k = 0;
-    for (j = 2; j < mh - 2; j += 4) {
-        k += 4;
-        wk1r = csc1 * (wd1r + w[k]);
-        wk1i = csc1 * (wd1i + w[k + 1]);
-        wk3r = csc3 * (wd3r + w[k + 2]);
-        wk3i = csc3 * (wd3i + w[k + 3]);
-        wd1r = w[k];
-        wd1i = w[k + 1];
-        wd3r = w[k + 2];
-        wd3i = w[k + 3];
-        j1 = j + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j] + a[j2];
-        x0i = a[j + 1] + a[j2 + 1];
-        x1r = a[j] - a[j2];
-        x1i = a[j + 1] - a[j2 + 1];
-        y0r = a[j + 2] + a[j2 + 2];
-        y0i = a[j + 3] + a[j2 + 3];
-        y1r = a[j + 2] - a[j2 + 2];
-        y1i = a[j + 3] - a[j2 + 3];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        y2r = a[j1 + 2] + a[j3 + 2];
-        y2i = a[j1 + 3] + a[j3 + 3];
-        y3r = a[j1 + 2] - a[j3 + 2];
-        y3i = a[j1 + 3] - a[j3 + 3];
-        a[j] = x0r + x2r;
-        a[j + 1] = x0i + x2i;
-        a[j + 2] = y0r + y2r;
-        a[j + 3] = y0i + y2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i - x2i;
-        a[j1 + 2] = y0r - y2r;
-        a[j1 + 3] = y0i - y2i;
-        x0r = x1r - x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1r * x0r - wk1i * x0i;
-        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
-        x0r = y1r - y3i;
-        x0i = y1i + y3r;
-        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
-        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
-        x0r = x1r + x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3r * x0r + wk3i * x0i;
-        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
-        x0r = y1r + y3i;
-        x0i = y1i - y3r;
-        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
-        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
-        j0 = m - j;
-        j1 = j0 + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j0] + a[j2];
-        x0i = a[j0 + 1] + a[j2 + 1];
-        x1r = a[j0] - a[j2];
-        x1i = a[j0 + 1] - a[j2 + 1];
-        y0r = a[j0 - 2] + a[j2 - 2];
-        y0i = a[j0 - 1] + a[j2 - 1];
-        y1r = a[j0 - 2] - a[j2 - 2];
-        y1i = a[j0 - 1] - a[j2 - 1];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        y2r = a[j1 - 2] + a[j3 - 2];
-        y2i = a[j1 - 1] + a[j3 - 1];
-        y3r = a[j1 - 2] - a[j3 - 2];
-        y3i = a[j1 - 1] - a[j3 - 1];
-        a[j0] = x0r + x2r;
-        a[j0 + 1] = x0i + x2i;
-        a[j0 - 2] = y0r + y2r;
-        a[j0 - 1] = y0i + y2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i - x2i;
-        a[j1 - 2] = y0r - y2r;
-        a[j1 - 1] = y0i - y2i;
-        x0r = x1r - x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1i * x0r - wk1r * x0i;
-        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
-        x0r = y1r - y3i;
-        x0i = y1i + y3r;
-        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
-        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
-        x0r = x1r + x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3i * x0r + wk3r * x0i;
-        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
-        x0r = y1r + y3i;
-        x0i = y1i - y3r;
-        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
-        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
-    }
-    wk1r = csc1 * (wd1r + wn4r);
-    wk1i = csc1 * (wd1i + wn4r);
-    wk3r = csc3 * (wd3r - wn4r);
-    wk3i = csc3 * (wd3i - wn4r);
-    j0 = mh;
-    j1 = j0 + m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[j0 - 2] + a[j2 - 2];
-    x0i = a[j0 - 1] + a[j2 - 1];
-    x1r = a[j0 - 2] - a[j2 - 2];
-    x1i = a[j0 - 1] - a[j2 - 1];
-    x2r = a[j1 - 2] + a[j3 - 2];
-    x2i = a[j1 - 1] + a[j3 - 1];
-    x3r = a[j1 - 2] - a[j3 - 2];
-    x3i = a[j1 - 1] - a[j3 - 1];
-    a[j0 - 2] = x0r + x2r;
-    a[j0 - 1] = x0i + x2i;
-    a[j1 - 2] = x0r - x2r;
-    a[j1 - 1] = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
-    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
-    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
-    x0r = a[j0] + a[j2];
-    x0i = a[j0 + 1] + a[j2 + 1];
-    x1r = a[j0] - a[j2];
-    x1i = a[j0 + 1] - a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[j0] = x0r + x2r;
-    a[j0 + 1] = x0i + x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    a[j2] = wn4r * (x0r - x0i);
-    a[j2 + 1] = wn4r * (x0i + x0r);
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    a[j3] = -wn4r * (x0r + x0i);
-    a[j3 + 1] = -wn4r * (x0i - x0r);
-    x0r = a[j0 + 2] + a[j2 + 2];
-    x0i = a[j0 + 3] + a[j2 + 3];
-    x1r = a[j0 + 2] - a[j2 + 2];
-    x1i = a[j0 + 3] - a[j2 + 3];
-    x2r = a[j1 + 2] + a[j3 + 2];
-    x2i = a[j1 + 3] + a[j3 + 3];
-    x3r = a[j1 + 2] - a[j3 + 2];
-    x3i = a[j1 + 3] - a[j3 + 3];
-    a[j0 + 2] = x0r + x2r;
-    a[j0 + 3] = x0i + x2i;
-    a[j1 + 2] = x0r - x2r;
-    a[j1 + 3] = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
-    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
-    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
-}
-
-
-void cftb1st(int n, double *a, double *w)
-{
-    int j, j0, j1, j2, j3, k, m, mh;
-    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
-        wd1r, wd1i, wd3r, wd3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
-    
-    mh = n >> 3;
-    m = 2 * mh;
-    j1 = m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[0] + a[j2];
-    x0i = -a[1] - a[j2 + 1];
-    x1r = a[0] - a[j2];
-    x1i = -a[1] + a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[0] = x0r + x2r;
-    a[1] = x0i - x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i + x2i;
-    a[j2] = x1r + x3i;
-    a[j2 + 1] = x1i + x3r;
-    a[j3] = x1r - x3i;
-    a[j3 + 1] = x1i - x3r;
-    wn4r = w[1];
-    csc1 = w[2];
-    csc3 = w[3];
-    wd1r = 1;
-    wd1i = 0;
-    wd3r = 1;
-    wd3i = 0;
-    k = 0;
-    for (j = 2; j < mh - 2; j += 4) {
-        k += 4;
-        wk1r = csc1 * (wd1r + w[k]);
-        wk1i = csc1 * (wd1i + w[k + 1]);
-        wk3r = csc3 * (wd3r + w[k + 2]);
-        wk3i = csc3 * (wd3i + w[k + 3]);
-        wd1r = w[k];
-        wd1i = w[k + 1];
-        wd3r = w[k + 2];
-        wd3i = w[k + 3];
-        j1 = j + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j] + a[j2];
-        x0i = -a[j + 1] - a[j2 + 1];
-        x1r = a[j] - a[j2];
-        x1i = -a[j + 1] + a[j2 + 1];
-        y0r = a[j + 2] + a[j2 + 2];
-        y0i = -a[j + 3] - a[j2 + 3];
-        y1r = a[j + 2] - a[j2 + 2];
-        y1i = -a[j + 3] + a[j2 + 3];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        y2r = a[j1 + 2] + a[j3 + 2];
-        y2i = a[j1 + 3] + a[j3 + 3];
-        y3r = a[j1 + 2] - a[j3 + 2];
-        y3i = a[j1 + 3] - a[j3 + 3];
-        a[j] = x0r + x2r;
-        a[j + 1] = x0i - x2i;
-        a[j + 2] = y0r + y2r;
-        a[j + 3] = y0i - y2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i + x2i;
-        a[j1 + 2] = y0r - y2r;
-        a[j1 + 3] = y0i + y2i;
-        x0r = x1r + x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1r * x0r - wk1i * x0i;
-        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
-        x0r = y1r + y3i;
-        x0i = y1i + y3r;
-        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
-        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
-        x0r = x1r - x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3r * x0r + wk3i * x0i;
-        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
-        x0r = y1r - y3i;
-        x0i = y1i - y3r;
-        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
-        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
-        j0 = m - j;
-        j1 = j0 + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j0] + a[j2];
-        x0i = -a[j0 + 1] - a[j2 + 1];
-        x1r = a[j0] - a[j2];
-        x1i = -a[j0 + 1] + a[j2 + 1];
-        y0r = a[j0 - 2] + a[j2 - 2];
-        y0i = -a[j0 - 1] - a[j2 - 1];
-        y1r = a[j0 - 2] - a[j2 - 2];
-        y1i = -a[j0 - 1] + a[j2 - 1];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        y2r = a[j1 - 2] + a[j3 - 2];
-        y2i = a[j1 - 1] + a[j3 - 1];
-        y3r = a[j1 - 2] - a[j3 - 2];
-        y3i = a[j1 - 1] - a[j3 - 1];
-        a[j0] = x0r + x2r;
-        a[j0 + 1] = x0i - x2i;
-        a[j0 - 2] = y0r + y2r;
-        a[j0 - 1] = y0i - y2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i + x2i;
-        a[j1 - 2] = y0r - y2r;
-        a[j1 - 1] = y0i + y2i;
-        x0r = x1r + x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1i * x0r - wk1r * x0i;
-        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
-        x0r = y1r + y3i;
-        x0i = y1i + y3r;
-        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
-        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
-        x0r = x1r - x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3i * x0r + wk3r * x0i;
-        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
-        x0r = y1r - y3i;
-        x0i = y1i - y3r;
-        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
-        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
-    }
-    wk1r = csc1 * (wd1r + wn4r);
-    wk1i = csc1 * (wd1i + wn4r);
-    wk3r = csc3 * (wd3r - wn4r);
-    wk3i = csc3 * (wd3i - wn4r);
-    j0 = mh;
-    j1 = j0 + m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[j0 - 2] + a[j2 - 2];
-    x0i = -a[j0 - 1] - a[j2 - 1];
-    x1r = a[j0 - 2] - a[j2 - 2];
-    x1i = -a[j0 - 1] + a[j2 - 1];
-    x2r = a[j1 - 2] + a[j3 - 2];
-    x2i = a[j1 - 1] + a[j3 - 1];
-    x3r = a[j1 - 2] - a[j3 - 2];
-    x3i = a[j1 - 1] - a[j3 - 1];
-    a[j0 - 2] = x0r + x2r;
-    a[j0 - 1] = x0i - x2i;
-    a[j1 - 2] = x0r - x2r;
-    a[j1 - 1] = x0i + x2i;
-    x0r = x1r + x3i;
-    x0i = x1i + x3r;
-    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
-    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
-    x0r = x1r - x3i;
-    x0i = x1i - x3r;
-    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
-    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
-    x0r = a[j0] + a[j2];
-    x0i = -a[j0 + 1] - a[j2 + 1];
-    x1r = a[j0] - a[j2];
-    x1i = -a[j0 + 1] + a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[j0] = x0r + x2r;
-    a[j0 + 1] = x0i - x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i + x2i;
-    x0r = x1r + x3i;
-    x0i = x1i + x3r;
-    a[j2] = wn4r * (x0r - x0i);
-    a[j2 + 1] = wn4r * (x0i + x0r);
-    x0r = x1r - x3i;
-    x0i = x1i - x3r;
-    a[j3] = -wn4r * (x0r + x0i);
-    a[j3 + 1] = -wn4r * (x0i - x0r);
-    x0r = a[j0 + 2] + a[j2 + 2];
-    x0i = -a[j0 + 3] - a[j2 + 3];
-    x1r = a[j0 + 2] - a[j2 + 2];
-    x1i = -a[j0 + 3] + a[j2 + 3];
-    x2r = a[j1 + 2] + a[j3 + 2];
-    x2i = a[j1 + 3] + a[j3 + 3];
-    x3r = a[j1 + 2] - a[j3 + 2];
-    x3i = a[j1 + 3] - a[j3 + 3];
-    a[j0 + 2] = x0r + x2r;
-    a[j0 + 3] = x0i - x2i;
-    a[j1 + 2] = x0r - x2r;
-    a[j1 + 3] = x0i + x2i;
-    x0r = x1r + x3i;
-    x0i = x1i + x3r;
-    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
-    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
-    x0r = x1r - x3i;
-    x0i = x1i - x3r;
-    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
-    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
-}
-
-
-#ifdef USE_CDFT_THREADS
-struct cdft_arg_st {
-    int n0;
-    int n;
-    double *a;
-    int nw;
-    double *w;
-};
-typedef struct cdft_arg_st cdft_arg_t;
-
-
-void cftrec4_th(int n, double *a, int nw, double *w)
-{
-    void *cftrec1_th(void *p);
-    void *cftrec2_th(void *p);
-    int i, idiv4, m, nthread;
-    cdft_thread_t th[4];
-    cdft_arg_t ag[4];
-    
-    nthread = 2;
-    idiv4 = 0;
-    m = n >> 1;
-    if (n > CDFT_4THREADS_BEGIN_N) {
-        nthread = 4;
-        idiv4 = 1;
-        m >>= 1;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].n0 = n;
-        ag[i].n = m;
-        ag[i].a = &a[i * m];
-        ag[i].nw = nw;
-        ag[i].w = w;
-        if (i != idiv4) {
-            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
-        } else {
-            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
-        }
-    }
-    for (i = 0; i < nthread; i++) {
-        cdft_thread_wait(th[i]);
-    }
-}
-
-
-void *cftrec1_th(void *p)
-{
-    int cfttree(int n, int j, int k, double *a, int nw, double *w);
-    void cftleaf(int n, int isplt, double *a, int nw, double *w);
-    void cftmdl1(int n, double *a, double *w);
-    int isplt, j, k, m, n, n0, nw;
-    double *a, *w;
-    
-    n0 = ((cdft_arg_t *) p)->n0;
-    n = ((cdft_arg_t *) p)->n;
-    a = ((cdft_arg_t *) p)->a;
-    nw = ((cdft_arg_t *) p)->nw;
-    w = ((cdft_arg_t *) p)->w;
-    m = n0;
-    while (m > 512) {
-        m >>= 2;
-        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
-    }
-    cftleaf(m, 1, &a[n - m], nw, w);
-    k = 0;
-    for (j = n - m; j > 0; j -= m) {
-        k++;
-        isplt = cfttree(m, j, k, a, nw, w);
-        cftleaf(m, isplt, &a[j - m], nw, w);
-    }
-    return (void *) 0;
-}
-
-
-void *cftrec2_th(void *p)
-{
-    int cfttree(int n, int j, int k, double *a, int nw, double *w);
-    void cftleaf(int n, int isplt, double *a, int nw, double *w);
-    void cftmdl2(int n, double *a, double *w);
-    int isplt, j, k, m, n, n0, nw;
-    double *a, *w;
-    
-    n0 = ((cdft_arg_t *) p)->n0;
-    n = ((cdft_arg_t *) p)->n;
-    a = ((cdft_arg_t *) p)->a;
-    nw = ((cdft_arg_t *) p)->nw;
-    w = ((cdft_arg_t *) p)->w;
-    k = 1;
-    m = n0;
-    while (m > 512) {
-        m >>= 2;
-        k <<= 2;
-        cftmdl2(m, &a[n - m], &w[nw - m]);
-    }
-    cftleaf(m, 0, &a[n - m], nw, w);
-    k >>= 1;
-    for (j = n - m; j > 0; j -= m) {
-        k++;
-        isplt = cfttree(m, j, k, a, nw, w);
-        cftleaf(m, isplt, &a[j - m], nw, w);
-    }
-    return (void *) 0;
-}
-#endif /* USE_CDFT_THREADS */
-
-
-void cftrec4(int n, double *a, int nw, double *w)
-{
-    int cfttree(int n, int j, int k, double *a, int nw, double *w);
-    void cftleaf(int n, int isplt, double *a, int nw, double *w);
-    void cftmdl1(int n, double *a, double *w);
-    int isplt, j, k, m;
-    
-    m = n;
-    while (m > 512) {
-        m >>= 2;
-        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
-    }
-    cftleaf(m, 1, &a[n - m], nw, w);
-    k = 0;
-    for (j = n - m; j > 0; j -= m) {
-        k++;
-        isplt = cfttree(m, j, k, a, nw, w);
-        cftleaf(m, isplt, &a[j - m], nw, w);
-    }
-}
-
-
-int cfttree(int n, int j, int k, double *a, int nw, double *w)
-{
-    void cftmdl1(int n, double *a, double *w);
-    void cftmdl2(int n, double *a, double *w);
-    int i, isplt, m;
-    
-    if ((k & 3) != 0) {
-        isplt = k & 1;
-        if (isplt != 0) {
-            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
-        } else {
-            cftmdl2(n, &a[j - n], &w[nw - n]);
-        }
-    } else {
-        m = n;
-        for (i = k; (i & 3) == 0; i >>= 2) {
-            m <<= 2;
-        }
-        isplt = i & 1;
-        if (isplt != 0) {
-            while (m > 128) {
-                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
-                m >>= 2;
-            }
-        } else {
-            while (m > 128) {
-                cftmdl2(m, &a[j - m], &w[nw - m]);
-                m >>= 2;
-            }
-        }
-    }
-    return isplt;
-}
-
-
-void cftleaf(int n, int isplt, double *a, int nw, double *w)
-{
-    void cftmdl1(int n, double *a, double *w);
-    void cftmdl2(int n, double *a, double *w);
-    void cftf161(double *a, double *w);
-    void cftf162(double *a, double *w);
-    void cftf081(double *a, double *w);
-    void cftf082(double *a, double *w);
-    
-    if (n == 512) {
-        cftmdl1(128, a, &w[nw - 64]);
-        cftf161(a, &w[nw - 8]);
-        cftf162(&a[32], &w[nw - 32]);
-        cftf161(&a[64], &w[nw - 8]);
-        cftf161(&a[96], &w[nw - 8]);
-        cftmdl2(128, &a[128], &w[nw - 128]);
-        cftf161(&a[128], &w[nw - 8]);
-        cftf162(&a[160], &w[nw - 32]);
-        cftf161(&a[192], &w[nw - 8]);
-        cftf162(&a[224], &w[nw - 32]);
-        cftmdl1(128, &a[256], &w[nw - 64]);
-        cftf161(&a[256], &w[nw - 8]);
-        cftf162(&a[288], &w[nw - 32]);
-        cftf161(&a[320], &w[nw - 8]);
-        cftf161(&a[352], &w[nw - 8]);
-        if (isplt != 0) {
-            cftmdl1(128, &a[384], &w[nw - 64]);
-            cftf161(&a[480], &w[nw - 8]);
-        } else {
-            cftmdl2(128, &a[384], &w[nw - 128]);
-            cftf162(&a[480], &w[nw - 32]);
-        }
-        cftf161(&a[384], &w[nw - 8]);
-        cftf162(&a[416], &w[nw - 32]);
-        cftf161(&a[448], &w[nw - 8]);
-    } else {
-        cftmdl1(64, a, &w[nw - 32]);
-        cftf081(a, &w[nw - 8]);
-        cftf082(&a[16], &w[nw - 8]);
-        cftf081(&a[32], &w[nw - 8]);
-        cftf081(&a[48], &w[nw - 8]);
-        cftmdl2(64, &a[64], &w[nw - 64]);
-        cftf081(&a[64], &w[nw - 8]);
-        cftf082(&a[80], &w[nw - 8]);
-        cftf081(&a[96], &w[nw - 8]);
-        cftf082(&a[112], &w[nw - 8]);
-        cftmdl1(64, &a[128], &w[nw - 32]);
-        cftf081(&a[128], &w[nw - 8]);
-        cftf082(&a[144], &w[nw - 8]);
-        cftf081(&a[160], &w[nw - 8]);
-        cftf081(&a[176], &w[nw - 8]);
-        if (isplt != 0) {
-            cftmdl1(64, &a[192], &w[nw - 32]);
-            cftf081(&a[240], &w[nw - 8]);
-        } else {
-            cftmdl2(64, &a[192], &w[nw - 64]);
-            cftf082(&a[240], &w[nw - 8]);
-        }
-        cftf081(&a[192], &w[nw - 8]);
-        cftf082(&a[208], &w[nw - 8]);
-        cftf081(&a[224], &w[nw - 8]);
-    }
-}
-
-
-void cftmdl1(int n, double *a, double *w)
-{
-    int j, j0, j1, j2, j3, k, m, mh;
-    double wn4r, wk1r, wk1i, wk3r, wk3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    mh = n >> 3;
-    m = 2 * mh;
-    j1 = m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[0] + a[j2];
-    x0i = a[1] + a[j2 + 1];
-    x1r = a[0] - a[j2];
-    x1i = a[1] - a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[0] = x0r + x2r;
-    a[1] = x0i + x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i - x2i;
-    a[j2] = x1r - x3i;
-    a[j2 + 1] = x1i + x3r;
-    a[j3] = x1r + x3i;
-    a[j3 + 1] = x1i - x3r;
-    wn4r = w[1];
-    k = 0;
-    for (j = 2; j < mh; j += 2) {
-        k += 4;
-        wk1r = w[k];
-        wk1i = w[k + 1];
-        wk3r = w[k + 2];
-        wk3i = w[k + 3];
-        j1 = j + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j] + a[j2];
-        x0i = a[j + 1] + a[j2 + 1];
-        x1r = a[j] - a[j2];
-        x1i = a[j + 1] - a[j2 + 1];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        a[j] = x0r + x2r;
-        a[j + 1] = x0i + x2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i - x2i;
-        x0r = x1r - x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1r * x0r - wk1i * x0i;
-        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
-        x0r = x1r + x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3r * x0r + wk3i * x0i;
-        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
-        j0 = m - j;
-        j1 = j0 + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j0] + a[j2];
-        x0i = a[j0 + 1] + a[j2 + 1];
-        x1r = a[j0] - a[j2];
-        x1i = a[j0 + 1] - a[j2 + 1];
-        x2r = a[j1] + a[j3];
-        x2i = a[j1 + 1] + a[j3 + 1];
-        x3r = a[j1] - a[j3];
-        x3i = a[j1 + 1] - a[j3 + 1];
-        a[j0] = x0r + x2r;
-        a[j0 + 1] = x0i + x2i;
-        a[j1] = x0r - x2r;
-        a[j1 + 1] = x0i - x2i;
-        x0r = x1r - x3i;
-        x0i = x1i + x3r;
-        a[j2] = wk1i * x0r - wk1r * x0i;
-        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
-        x0r = x1r + x3i;
-        x0i = x1i - x3r;
-        a[j3] = wk3i * x0r + wk3r * x0i;
-        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
-    }
-    j0 = mh;
-    j1 = j0 + m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[j0] + a[j2];
-    x0i = a[j0 + 1] + a[j2 + 1];
-    x1r = a[j0] - a[j2];
-    x1i = a[j0 + 1] - a[j2 + 1];
-    x2r = a[j1] + a[j3];
-    x2i = a[j1 + 1] + a[j3 + 1];
-    x3r = a[j1] - a[j3];
-    x3i = a[j1 + 1] - a[j3 + 1];
-    a[j0] = x0r + x2r;
-    a[j0 + 1] = x0i + x2i;
-    a[j1] = x0r - x2r;
-    a[j1 + 1] = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    a[j2] = wn4r * (x0r - x0i);
-    a[j2 + 1] = wn4r * (x0i + x0r);
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    a[j3] = -wn4r * (x0r + x0i);
-    a[j3 + 1] = -wn4r * (x0i - x0r);
-}
-
-
-void cftmdl2(int n, double *a, double *w)
-{
-    int j, j0, j1, j2, j3, k, kr, m, mh;
-    double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
-    
-    mh = n >> 3;
-    m = 2 * mh;
-    wn4r = w[1];
-    j1 = m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[0] - a[j2 + 1];
-    x0i = a[1] + a[j2];
-    x1r = a[0] + a[j2 + 1];
-    x1i = a[1] - a[j2];
-    x2r = a[j1] - a[j3 + 1];
-    x2i = a[j1 + 1] + a[j3];
-    x3r = a[j1] + a[j3 + 1];
-    x3i = a[j1 + 1] - a[j3];
-    y0r = wn4r * (x2r - x2i);
-    y0i = wn4r * (x2i + x2r);
-    a[0] = x0r + y0r;
-    a[1] = x0i + y0i;
-    a[j1] = x0r - y0r;
-    a[j1 + 1] = x0i - y0i;
-    y0r = wn4r * (x3r - x3i);
-    y0i = wn4r * (x3i + x3r);
-    a[j2] = x1r - y0i;
-    a[j2 + 1] = x1i + y0r;
-    a[j3] = x1r + y0i;
-    a[j3 + 1] = x1i - y0r;
-    k = 0;
-    kr = 2 * m;
-    for (j = 2; j < mh; j += 2) {
-        k += 4;
-        wk1r = w[k];
-        wk1i = w[k + 1];
-        wk3r = w[k + 2];
-        wk3i = w[k + 3];
-        kr -= 4;
-        wd1i = w[kr];
-        wd1r = w[kr + 1];
-        wd3i = w[kr + 2];
-        wd3r = w[kr + 3];
-        j1 = j + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j] - a[j2 + 1];
-        x0i = a[j + 1] + a[j2];
-        x1r = a[j] + a[j2 + 1];
-        x1i = a[j + 1] - a[j2];
-        x2r = a[j1] - a[j3 + 1];
-        x2i = a[j1 + 1] + a[j3];
-        x3r = a[j1] + a[j3 + 1];
-        x3i = a[j1 + 1] - a[j3];
-        y0r = wk1r * x0r - wk1i * x0i;
-        y0i = wk1r * x0i + wk1i * x0r;
-        y2r = wd1r * x2r - wd1i * x2i;
-        y2i = wd1r * x2i + wd1i * x2r;
-        a[j] = y0r + y2r;
-        a[j + 1] = y0i + y2i;
-        a[j1] = y0r - y2r;
-        a[j1 + 1] = y0i - y2i;
-        y0r = wk3r * x1r + wk3i * x1i;
-        y0i = wk3r * x1i - wk3i * x1r;
-        y2r = wd3r * x3r + wd3i * x3i;
-        y2i = wd3r * x3i - wd3i * x3r;
-        a[j2] = y0r + y2r;
-        a[j2 + 1] = y0i + y2i;
-        a[j3] = y0r - y2r;
-        a[j3 + 1] = y0i - y2i;
-        j0 = m - j;
-        j1 = j0 + m;
-        j2 = j1 + m;
-        j3 = j2 + m;
-        x0r = a[j0] - a[j2 + 1];
-        x0i = a[j0 + 1] + a[j2];
-        x1r = a[j0] + a[j2 + 1];
-        x1i = a[j0 + 1] - a[j2];
-        x2r = a[j1] - a[j3 + 1];
-        x2i = a[j1 + 1] + a[j3];
-        x3r = a[j1] + a[j3 + 1];
-        x3i = a[j1 + 1] - a[j3];
-        y0r = wd1i * x0r - wd1r * x0i;
-        y0i = wd1i * x0i + wd1r * x0r;
-        y2r = wk1i * x2r - wk1r * x2i;
-        y2i = wk1i * x2i + wk1r * x2r;
-        a[j0] = y0r + y2r;
-        a[j0 + 1] = y0i + y2i;
-        a[j1] = y0r - y2r;
-        a[j1 + 1] = y0i - y2i;
-        y0r = wd3i * x1r + wd3r * x1i;
-        y0i = wd3i * x1i - wd3r * x1r;
-        y2r = wk3i * x3r + wk3r * x3i;
-        y2i = wk3i * x3i - wk3r * x3r;
-        a[j2] = y0r + y2r;
-        a[j2 + 1] = y0i + y2i;
-        a[j3] = y0r - y2r;
-        a[j3 + 1] = y0i - y2i;
-    }
-    wk1r = w[m];
-    wk1i = w[m + 1];
-    j0 = mh;
-    j1 = j0 + m;
-    j2 = j1 + m;
-    j3 = j2 + m;
-    x0r = a[j0] - a[j2 + 1];
-    x0i = a[j0 + 1] + a[j2];
-    x1r = a[j0] + a[j2 + 1];
-    x1i = a[j0 + 1] - a[j2];
-    x2r = a[j1] - a[j3 + 1];
-    x2i = a[j1 + 1] + a[j3];
-    x3r = a[j1] + a[j3 + 1];
-    x3i = a[j1 + 1] - a[j3];
-    y0r = wk1r * x0r - wk1i * x0i;
-    y0i = wk1r * x0i + wk1i * x0r;
-    y2r = wk1i * x2r - wk1r * x2i;
-    y2i = wk1i * x2i + wk1r * x2r;
-    a[j0] = y0r + y2r;
-    a[j0 + 1] = y0i + y2i;
-    a[j1] = y0r - y2r;
-    a[j1 + 1] = y0i - y2i;
-    y0r = wk1i * x1r - wk1r * x1i;
-    y0i = wk1i * x1i + wk1r * x1r;
-    y2r = wk1r * x3r - wk1i * x3i;
-    y2i = wk1r * x3i + wk1i * x3r;
-    a[j2] = y0r - y2r;
-    a[j2 + 1] = y0i - y2i;
-    a[j3] = y0r + y2r;
-    a[j3 + 1] = y0i + y2i;
-}
-
-
-void cftfx41(int n, double *a, int nw, double *w)
-{
-    void cftf161(double *a, double *w);
-    void cftf162(double *a, double *w);
-    void cftf081(double *a, double *w);
-    void cftf082(double *a, double *w);
-    
-    if (n == 128) {
-        cftf161(a, &w[nw - 8]);
-        cftf162(&a[32], &w[nw - 32]);
-        cftf161(&a[64], &w[nw - 8]);
-        cftf161(&a[96], &w[nw - 8]);
-    } else {
-        cftf081(a, &w[nw - 8]);
-        cftf082(&a[16], &w[nw - 8]);
-        cftf081(&a[32], &w[nw - 8]);
-        cftf081(&a[48], &w[nw - 8]);
-    }
-}
-
-
-void cftf161(double *a, double *w)
-{
-    double wn4r, wk1r, wk1i, 
-        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
-        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
-        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
-        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
-    
-    wn4r = w[1];
-    wk1r = w[2];
-    wk1i = w[3];
-    x0r = a[0] + a[16];
-    x0i = a[1] + a[17];
-    x1r = a[0] - a[16];
-    x1i = a[1] - a[17];
-    x2r = a[8] + a[24];
-    x2i = a[9] + a[25];
-    x3r = a[8] - a[24];
-    x3i = a[9] - a[25];
-    y0r = x0r + x2r;
-    y0i = x0i + x2i;
-    y4r = x0r - x2r;
-    y4i = x0i - x2i;
-    y8r = x1r - x3i;
-    y8i = x1i + x3r;
-    y12r = x1r + x3i;
-    y12i = x1i - x3r;
-    x0r = a[2] + a[18];
-    x0i = a[3] + a[19];
-    x1r = a[2] - a[18];
-    x1i = a[3] - a[19];
-    x2r = a[10] + a[26];
-    x2i = a[11] + a[27];
-    x3r = a[10] - a[26];
-    x3i = a[11] - a[27];
-    y1r = x0r + x2r;
-    y1i = x0i + x2i;
-    y5r = x0r - x2r;
-    y5i = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    y9r = wk1r * x0r - wk1i * x0i;
-    y9i = wk1r * x0i + wk1i * x0r;
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    y13r = wk1i * x0r - wk1r * x0i;
-    y13i = wk1i * x0i + wk1r * x0r;
-    x0r = a[4] + a[20];
-    x0i = a[5] + a[21];
-    x1r = a[4] - a[20];
-    x1i = a[5] - a[21];
-    x2r = a[12] + a[28];
-    x2i = a[13] + a[29];
-    x3r = a[12] - a[28];
-    x3i = a[13] - a[29];
-    y2r = x0r + x2r;
-    y2i = x0i + x2i;
-    y6r = x0r - x2r;
-    y6i = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    y10r = wn4r * (x0r - x0i);
-    y10i = wn4r * (x0i + x0r);
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    y14r = wn4r * (x0r + x0i);
-    y14i = wn4r * (x0i - x0r);
-    x0r = a[6] + a[22];
-    x0i = a[7] + a[23];
-    x1r = a[6] - a[22];
-    x1i = a[7] - a[23];
-    x2r = a[14] + a[30];
-    x2i = a[15] + a[31];
-    x3r = a[14] - a[30];
-    x3i = a[15] - a[31];
-    y3r = x0r + x2r;
-    y3i = x0i + x2i;
-    y7r = x0r - x2r;
-    y7i = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    y11r = wk1i * x0r - wk1r * x0i;
-    y11i = wk1i * x0i + wk1r * x0r;
-    x0r = x1r + x3i;
-    x0i = x1i - x3r;
-    y15r = wk1r * x0r - wk1i * x0i;
-    y15i = wk1r * x0i + wk1i * x0r;
-    x0r = y12r - y14r;
-    x0i = y12i - y14i;
-    x1r = y12r + y14r;
-    x1i = y12i + y14i;
-    x2r = y13r - y15r;
-    x2i = y13i - y15i;
-    x3r = y13r + y15r;
-    x3i = y13i + y15i;
-    a[24] = x0r + x2r;
-    a[25] = x0i + x2i;
-    a[26] = x0r - x2r;
-    a[27] = x0i - x2i;
-    a[28] = x1r - x3i;
-    a[29] = x1i + x3r;
-    a[30] = x1r + x3i;
-    a[31] = x1i - x3r;
-    x0r = y8r + y10r;
-    x0i = y8i + y10i;
-    x1r = y8r - y10r;
-    x1i = y8i - y10i;
-    x2r = y9r + y11r;
-    x2i = y9i + y11i;
-    x3r = y9r - y11r;
-    x3i = y9i - y11i;
-    a[16] = x0r + x2r;
-    a[17] = x0i + x2i;
-    a[18] = x0r - x2r;
-    a[19] = x0i - x2i;
-    a[20] = x1r - x3i;
-    a[21] = x1i + x3r;
-    a[22] = x1r + x3i;
-    a[23] = x1i - x3r;
-    x0r = y5r - y7i;
-    x0i = y5i + y7r;
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    x0r = y5r + y7i;
-    x0i = y5i - y7r;
-    x3r = wn4r * (x0r - x0i);
-    x3i = wn4r * (x0i + x0r);
-    x0r = y4r - y6i;
-    x0i = y4i + y6r;
-    x1r = y4r + y6i;
-    x1i = y4i - y6r;
-    a[8] = x0r + x2r;
-    a[9] = x0i + x2i;
-    a[10] = x0r - x2r;
-    a[11] = x0i - x2i;
-    a[12] = x1r - x3i;
-    a[13] = x1i + x3r;
-    a[14] = x1r + x3i;
-    a[15] = x1i - x3r;
-    x0r = y0r + y2r;
-    x0i = y0i + y2i;
-    x1r = y0r - y2r;
-    x1i = y0i - y2i;
-    x2r = y1r + y3r;
-    x2i = y1i + y3i;
-    x3r = y1r - y3r;
-    x3i = y1i - y3i;
-    a[0] = x0r + x2r;
-    a[1] = x0i + x2i;
-    a[2] = x0r - x2r;
-    a[3] = x0i - x2i;
-    a[4] = x1r - x3i;
-    a[5] = x1i + x3r;
-    a[6] = x1r + x3i;
-    a[7] = x1i - x3r;
-}
-
-
-void cftf162(double *a, double *w)
-{
-    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
-        x0r, x0i, x1r, x1i, x2r, x2i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
-        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
-        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
-        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
-    
-    wn4r = w[1];
-    wk1r = w[4];
-    wk1i = w[5];
-    wk3r = w[6];
-    wk3i = -w[7];
-    wk2r = w[8];
-    wk2i = w[9];
-    x1r = a[0] - a[17];
-    x1i = a[1] + a[16];
-    x0r = a[8] - a[25];
-    x0i = a[9] + a[24];
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    y0r = x1r + x2r;
-    y0i = x1i + x2i;
-    y4r = x1r - x2r;
-    y4i = x1i - x2i;
-    x1r = a[0] + a[17];
-    x1i = a[1] - a[16];
-    x0r = a[8] + a[25];
-    x0i = a[9] - a[24];
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    y8r = x1r - x2i;
-    y8i = x1i + x2r;
-    y12r = x1r + x2i;
-    y12i = x1i - x2r;
-    x0r = a[2] - a[19];
-    x0i = a[3] + a[18];
-    x1r = wk1r * x0r - wk1i * x0i;
-    x1i = wk1r * x0i + wk1i * x0r;
-    x0r = a[10] - a[27];
-    x0i = a[11] + a[26];
-    x2r = wk3i * x0r - wk3r * x0i;
-    x2i = wk3i * x0i + wk3r * x0r;
-    y1r = x1r + x2r;
-    y1i = x1i + x2i;
-    y5r = x1r - x2r;
-    y5i = x1i - x2i;
-    x0r = a[2] + a[19];
-    x0i = a[3] - a[18];
-    x1r = wk3r * x0r - wk3i * x0i;
-    x1i = wk3r * x0i + wk3i * x0r;
-    x0r = a[10] + a[27];
-    x0i = a[11] - a[26];
-    x2r = wk1r * x0r + wk1i * x0i;
-    x2i = wk1r * x0i - wk1i * x0r;
-    y9r = x1r - x2r;
-    y9i = x1i - x2i;
-    y13r = x1r + x2r;
-    y13i = x1i + x2i;
-    x0r = a[4] - a[21];
-    x0i = a[5] + a[20];
-    x1r = wk2r * x0r - wk2i * x0i;
-    x1i = wk2r * x0i + wk2i * x0r;
-    x0r = a[12] - a[29];
-    x0i = a[13] + a[28];
-    x2r = wk2i * x0r - wk2r * x0i;
-    x2i = wk2i * x0i + wk2r * x0r;
-    y2r = x1r + x2r;
-    y2i = x1i + x2i;
-    y6r = x1r - x2r;
-    y6i = x1i - x2i;
-    x0r = a[4] + a[21];
-    x0i = a[5] - a[20];
-    x1r = wk2i * x0r - wk2r * x0i;
-    x1i = wk2i * x0i + wk2r * x0r;
-    x0r = a[12] + a[29];
-    x0i = a[13] - a[28];
-    x2r = wk2r * x0r - wk2i * x0i;
-    x2i = wk2r * x0i + wk2i * x0r;
-    y10r = x1r - x2r;
-    y10i = x1i - x2i;
-    y14r = x1r + x2r;
-    y14i = x1i + x2i;
-    x0r = a[6] - a[23];
-    x0i = a[7] + a[22];
-    x1r = wk3r * x0r - wk3i * x0i;
-    x1i = wk3r * x0i + wk3i * x0r;
-    x0r = a[14] - a[31];
-    x0i = a[15] + a[30];
-    x2r = wk1i * x0r - wk1r * x0i;
-    x2i = wk1i * x0i + wk1r * x0r;
-    y3r = x1r + x2r;
-    y3i = x1i + x2i;
-    y7r = x1r - x2r;
-    y7i = x1i - x2i;
-    x0r = a[6] + a[23];
-    x0i = a[7] - a[22];
-    x1r = wk1i * x0r + wk1r * x0i;
-    x1i = wk1i * x0i - wk1r * x0r;
-    x0r = a[14] + a[31];
-    x0i = a[15] - a[30];
-    x2r = wk3i * x0r - wk3r * x0i;
-    x2i = wk3i * x0i + wk3r * x0r;
-    y11r = x1r + x2r;
-    y11i = x1i + x2i;
-    y15r = x1r - x2r;
-    y15i = x1i - x2i;
-    x1r = y0r + y2r;
-    x1i = y0i + y2i;
-    x2r = y1r + y3r;
-    x2i = y1i + y3i;
-    a[0] = x1r + x2r;
-    a[1] = x1i + x2i;
-    a[2] = x1r - x2r;
-    a[3] = x1i - x2i;
-    x1r = y0r - y2r;
-    x1i = y0i - y2i;
-    x2r = y1r - y3r;
-    x2i = y1i - y3i;
-    a[4] = x1r - x2i;
-    a[5] = x1i + x2r;
-    a[6] = x1r + x2i;
-    a[7] = x1i - x2r;
-    x1r = y4r - y6i;
-    x1i = y4i + y6r;
-    x0r = y5r - y7i;
-    x0i = y5i + y7r;
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    a[8] = x1r + x2r;
-    a[9] = x1i + x2i;
-    a[10] = x1r - x2r;
-    a[11] = x1i - x2i;
-    x1r = y4r + y6i;
-    x1i = y4i - y6r;
-    x0r = y5r + y7i;
-    x0i = y5i - y7r;
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    a[12] = x1r - x2i;
-    a[13] = x1i + x2r;
-    a[14] = x1r + x2i;
-    a[15] = x1i - x2r;
-    x1r = y8r + y10r;
-    x1i = y8i + y10i;
-    x2r = y9r - y11r;
-    x2i = y9i - y11i;
-    a[16] = x1r + x2r;
-    a[17] = x1i + x2i;
-    a[18] = x1r - x2r;
-    a[19] = x1i - x2i;
-    x1r = y8r - y10r;
-    x1i = y8i - y10i;
-    x2r = y9r + y11r;
-    x2i = y9i + y11i;
-    a[20] = x1r - x2i;
-    a[21] = x1i + x2r;
-    a[22] = x1r + x2i;
-    a[23] = x1i - x2r;
-    x1r = y12r - y14i;
-    x1i = y12i + y14r;
-    x0r = y13r + y15i;
-    x0i = y13i - y15r;
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    a[24] = x1r + x2r;
-    a[25] = x1i + x2i;
-    a[26] = x1r - x2r;
-    a[27] = x1i - x2i;
-    x1r = y12r + y14i;
-    x1i = y12i - y14r;
-    x0r = y13r - y15i;
-    x0i = y13i + y15r;
-    x2r = wn4r * (x0r - x0i);
-    x2i = wn4r * (x0i + x0r);
-    a[28] = x1r - x2i;
-    a[29] = x1i + x2r;
-    a[30] = x1r + x2i;
-    a[31] = x1i - x2r;
-}
-
-
-void cftf081(double *a, double *w)
-{
-    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
-        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
-    
-    wn4r = w[1];
-    x0r = a[0] + a[8];
-    x0i = a[1] + a[9];
-    x1r = a[0] - a[8];
-    x1i = a[1] - a[9];
-    x2r = a[4] + a[12];
-    x2i = a[5] + a[13];
-    x3r = a[4] - a[12];
-    x3i = a[5] - a[13];
-    y0r = x0r + x2r;
-    y0i = x0i + x2i;
-    y2r = x0r - x2r;
-    y2i = x0i - x2i;
-    y1r = x1r - x3i;
-    y1i = x1i + x3r;
-    y3r = x1r + x3i;
-    y3i = x1i - x3r;
-    x0r = a[2] + a[10];
-    x0i = a[3] + a[11];
-    x1r = a[2] - a[10];
-    x1i = a[3] - a[11];
-    x2r = a[6] + a[14];
-    x2i = a[7] + a[15];
-    x3r = a[6] - a[14];
-    x3i = a[7] - a[15];
-    y4r = x0r + x2r;
-    y4i = x0i + x2i;
-    y6r = x0r - x2r;
-    y6i = x0i - x2i;
-    x0r = x1r - x3i;
-    x0i = x1i + x3r;
-    x2r = x1r + x3i;
-    x2i = x1i - x3r;
-    y5r = wn4r * (x0r - x0i);
-    y5i = wn4r * (x0r + x0i);
-    y7r = wn4r * (x2r - x2i);
-    y7i = wn4r * (x2r + x2i);
-    a[8] = y1r + y5r;
-    a[9] = y1i + y5i;
-    a[10] = y1r - y5r;
-    a[11] = y1i - y5i;
-    a[12] = y3r - y7i;
-    a[13] = y3i + y7r;
-    a[14] = y3r + y7i;
-    a[15] = y3i - y7r;
-    a[0] = y0r + y4r;
-    a[1] = y0i + y4i;
-    a[2] = y0r - y4r;
-    a[3] = y0i - y4i;
-    a[4] = y2r - y6i;
-    a[5] = y2i + y6r;
-    a[6] = y2r + y6i;
-    a[7] = y2i - y6r;
-}
-
-
-void cftf082(double *a, double *w)
-{
-    double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, 
-        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
-        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
-    
-    wn4r = w[1];
-    wk1r = w[2];
-    wk1i = w[3];
-    y0r = a[0] - a[9];
-    y0i = a[1] + a[8];
-    y1r = a[0] + a[9];
-    y1i = a[1] - a[8];
-    x0r = a[4] - a[13];
-    x0i = a[5] + a[12];
-    y2r = wn4r * (x0r - x0i);
-    y2i = wn4r * (x0i + x0r);
-    x0r = a[4] + a[13];
-    x0i = a[5] - a[12];
-    y3r = wn4r * (x0r - x0i);
-    y3i = wn4r * (x0i + x0r);
-    x0r = a[2] - a[11];
-    x0i = a[3] + a[10];
-    y4r = wk1r * x0r - wk1i * x0i;
-    y4i = wk1r * x0i + wk1i * x0r;
-    x0r = a[2] + a[11];
-    x0i = a[3] - a[10];
-    y5r = wk1i * x0r - wk1r * x0i;
-    y5i = wk1i * x0i + wk1r * x0r;
-    x0r = a[6] - a[15];
-    x0i = a[7] + a[14];
-    y6r = wk1i * x0r - wk1r * x0i;
-    y6i = wk1i * x0i + wk1r * x0r;
-    x0r = a[6] + a[15];
-    x0i = a[7] - a[14];
-    y7r = wk1r * x0r - wk1i * x0i;
-    y7i = wk1r * x0i + wk1i * x0r;
-    x0r = y0r + y2r;
-    x0i = y0i + y2i;
-    x1r = y4r + y6r;
-    x1i = y4i + y6i;
-    a[0] = x0r + x1r;
-    a[1] = x0i + x1i;
-    a[2] = x0r - x1r;
-    a[3] = x0i - x1i;
-    x0r = y0r - y2r;
-    x0i = y0i - y2i;
-    x1r = y4r - y6r;
-    x1i = y4i - y6i;
-    a[4] = x0r - x1i;
-    a[5] = x0i + x1r;
-    a[6] = x0r + x1i;
-    a[7] = x0i - x1r;
-    x0r = y1r - y3i;
-    x0i = y1i + y3r;
-    x1r = y5r - y7r;
-    x1i = y5i - y7i;
-    a[8] = x0r + x1r;
-    a[9] = x0i + x1i;
-    a[10] = x0r - x1r;
-    a[11] = x0i - x1i;
-    x0r = y1r + y3i;
-    x0i = y1i - y3r;
-    x1r = y5r + y7r;
-    x1i = y5i + y7i;
-    a[12] = x0r - x1i;
-    a[13] = x0i + x1r;
-    a[14] = x0r + x1i;
-    a[15] = x0i - x1r;
-}
-
-
-void cftf040(double *a)
-{
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    x0r = a[0] + a[4];
-    x0i = a[1] + a[5];
-    x1r = a[0] - a[4];
-    x1i = a[1] - a[5];
-    x2r = a[2] + a[6];
-    x2i = a[3] + a[7];
-    x3r = a[2] - a[6];
-    x3i = a[3] - a[7];
-    a[0] = x0r + x2r;
-    a[1] = x0i + x2i;
-    a[2] = x1r - x3i;
-    a[3] = x1i + x3r;
-    a[4] = x0r - x2r;
-    a[5] = x0i - x2i;
-    a[6] = x1r + x3i;
-    a[7] = x1i - x3r;
-}
-
-
-void cftb040(double *a)
-{
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    
-    x0r = a[0] + a[4];
-    x0i = a[1] + a[5];
-    x1r = a[0] - a[4];
-    x1i = a[1] - a[5];
-    x2r = a[2] + a[6];
-    x2i = a[3] + a[7];
-    x3r = a[2] - a[6];
-    x3i = a[3] - a[7];
-    a[0] = x0r + x2r;
-    a[1] = x0i + x2i;
-    a[2] = x1r + x3i;
-    a[3] = x1i - x3r;
-    a[4] = x0r - x2r;
-    a[5] = x0i - x2i;
-    a[6] = x1r - x3i;
-    a[7] = x1i + x3r;
-}
-
-
-void cftx020(double *a)
-{
-    double x0r, x0i;
-    
-    x0r = a[0] - a[2];
-    x0i = a[1] - a[3];
-    a[0] += a[2];
-    a[1] += a[3];
-    a[2] = x0r;
-    a[3] = x0i;
-}
-
-
-void rftfsub(int n, double *a, int nc, double *c)
-{
-    int j, k, kk, ks, m;
-    double wkr, wki, xr, xi, yr, yi;
-    
-    m = n >> 1;
-    ks = 2 * nc / m;
-    kk = 0;
-    for (j = 2; j < m; j += 2) {
-        k = n - j;
-        kk += ks;
-        wkr = 0.5 - c[nc - kk];
-        wki = c[kk];
-        xr = a[j] - a[k];
-        xi = a[j + 1] + a[k + 1];
-        yr = wkr * xr - wki * xi;
-        yi = wkr * xi + wki * xr;
-        a[j] -= yr;
-        a[j + 1] -= yi;
-        a[k] += yr;
-        a[k + 1] -= yi;
-    }
-}
-
-
-void rftbsub(int n, double *a, int nc, double *c)
-{
-    int j, k, kk, ks, m;
-    double wkr, wki, xr, xi, yr, yi;
-    
-    m = n >> 1;
-    ks = 2 * nc / m;
-    kk = 0;
-    for (j = 2; j < m; j += 2) {
-        k = n - j;
-        kk += ks;
-        wkr = 0.5 - c[nc - kk];
-        wki = c[kk];
-        xr = a[j] - a[k];
-        xi = a[j + 1] + a[k + 1];
-        yr = wkr * xr + wki * xi;
-        yi = wkr * xi - wki * xr;
-        a[j] -= yr;
-        a[j + 1] -= yi;
-        a[k] += yr;
-        a[k + 1] -= yi;
-    }
-}
-
-
-void dctsub(int n, double *a, int nc, double *c)
-{
-    int j, k, kk, ks, m;
-    double wkr, wki, xr;
-    
-    m = n >> 1;
-    ks = nc / n;
-    kk = 0;
-    for (j = 1; j < m; j++) {
-        k = n - j;
-        kk += ks;
-        wkr = c[kk] - c[nc - kk];
-        wki = c[kk] + c[nc - kk];
-        xr = wki * a[j] - wkr * a[k];
-        a[j] = wkr * a[j] + wki * a[k];
-        a[k] = xr;
-    }
-    a[m] *= c[0];
-}
-
-
-void dstsub(int n, double *a, int nc, double *c)
-{
-    int j, k, kk, ks, m;
-    double wkr, wki, xr;
-    
-    m = n >> 1;
-    ks = nc / n;
-    kk = 0;
-    for (j = 1; j < m; j++) {
-        k = n - j;
-        kk += ks;
-        wkr = c[kk] - c[nc - kk];
-        wki = c[kk] + c[nc - kk];
-        xr = wki * a[k] - wkr * a[j];
-        a[k] = wkr * a[k] + wki * a[j];
-        a[j] = xr;
-    }
-    a[m] *= c[0];
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg.f b/third_party/tensorflow_dependencies/fft2d/fftsg.f
deleted file mode 100644
index 6fb26c4..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg.f
+++ /dev/null
@@ -1,2967 +0,0 @@
-! Fast Fourier/Cosine/Sine Transform
-!     dimension   :one
-!     data length :power of 2
-!     decimation  :frequency
-!     radix       :split-radix
-!     data        :inplace
-!     table       :use
-! subroutines
-!     cdft: Complex Discrete Fourier Transform
-!     rdft: Real Discrete Fourier Transform
-!     ddct: Discrete Cosine Transform
-!     ddst: Discrete Sine Transform
-!     dfct: Cosine Transform of RDFT (Real Symmetric DFT)
-!     dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
-!
-!
-! -------- Complex DFT (Discrete Fourier Transform) --------
-!     [definition]
-!         <case1>
-!             X(k) = sum_j=0^n-1 x(j)*exp(2*pi*i*j*k/n), 0<=k<n
-!         <case2>
-!             X(k) = sum_j=0^n-1 x(j)*exp(-2*pi*i*j*k/n), 0<=k<n
-!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call cdft(2*n, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call cdft(2*n, -1, a, ip, w)
-!     [parameters]
-!         2*n          :data length (integer)
-!                       n >= 1, n = power of 2
-!         a(0:2*n-1)   :input/output data (real*8)
-!                       input data
-!                           a(2*j) = Re(x(j)), 
-!                           a(2*j+1) = Im(x(j)), 0<=j<n
-!                       output data
-!                           a(2*k) = Re(X(k)), 
-!                           a(2*k+1) = Im(X(k)), 0<=k<n
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n/2-1)   :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call cdft(2*n, -1, a, ip, w)
-!         is 
-!             call cdft(2*n, 1, a, ip, w)
-!             do j = 0, 2 * n - 1
-!                 a(j) = a(j) / n
-!             end do
-!         .
-!
-!
-! -------- Real DFT / Inverse of Real DFT --------
-!     [definition]
-!         <case1> RDFT
-!             R(k) = sum_j=0^n-1 a(j)*cos(2*pi*j*k/n), 0<=k<=n/2
-!             I(k) = sum_j=0^n-1 a(j)*sin(2*pi*j*k/n), 0<k<n/2
-!         <case2> IRDFT (excluding scale)
-!             a(k) = (R(0) + R(n/2)*cos(pi*k))/2 + 
-!                    sum_j=1^n/2-1 R(j)*cos(2*pi*j*k/n) + 
-!                    sum_j=1^n/2-1 I(j)*sin(2*pi*j*k/n), 0<=k<n
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call rdft(n, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call rdft(n, -1, a, ip, w)
-!     [parameters]
-!         n            :data length (integer)
-!                       n >= 2, n = power of 2
-!         a(0:n-1)     :input/output data (real*8)
-!                       <case1>
-!                           output data
-!                               a(2*k) = R(k), 0<=k<n/2
-!                               a(2*k+1) = I(k), 0<k<n/2
-!                               a(1) = R(n/2)
-!                       <case2>
-!                           input data
-!                               a(2*j) = R(j), 0<=j<n/2
-!                               a(2*j+1) = I(j), 0<j<n/2
-!                               a(1) = R(n/2)
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n/2)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n/2-1)   :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call rdft(n, 1, a, ip, w)
-!         is 
-!             call rdft(n, -1, a, ip, w)
-!             do j = 0, n - 1
-!                 a(j) = a(j) * 2 / n
-!             end do
-!         .
-!
-!
-! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-!     [definition]
-!         <case1> IDCT (excluding scale)
-!             C(k) = sum_j=0^n-1 a(j)*cos(pi*j*(k+1/2)/n), 0<=k<n
-!         <case2> DCT
-!             C(k) = sum_j=0^n-1 a(j)*cos(pi*(j+1/2)*k/n), 0<=k<n
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddct(n, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddct(n, -1, a, ip, w)
-!     [parameters]
-!         n            :data length (integer)
-!                       n >= 2, n = power of 2
-!         a(0:n-1)     :input/output data (real*8)
-!                       output data
-!                           a(k) = C(k), 0<=k<n
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n/2)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n*5/4-1) :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddct(n, -1, a, ip, w)
-!         is 
-!             a(0) = a(0) / 2
-!             call ddct(n, 1, a, ip, w)
-!             do j = 0, n - 1
-!                 a(j) = a(j) * 2 / n
-!             end do
-!         .
-!
-!
-! -------- DST (Discrete Sine Transform) / Inverse of DST --------
-!     [definition]
-!         <case1> IDST (excluding scale)
-!             S(k) = sum_j=1^n A(j)*sin(pi*j*(k+1/2)/n), 0<=k<n
-!         <case2> DST
-!             S(k) = sum_j=0^n-1 a(j)*sin(pi*(j+1/2)*k/n), 0<k<=n
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddst(n, 1, a, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddst(n, -1, a, ip, w)
-!     [parameters]
-!         n            :data length (integer)
-!                       n >= 2, n = power of 2
-!         a(0:n-1)     :input/output data (real*8)
-!                       <case1>
-!                           input data
-!                               a(j) = A(j), 0<j<n
-!                               a(0) = A(n)
-!                           output data
-!                               a(k) = S(k), 0<=k<n
-!                       <case2>
-!                           output data
-!                               a(k) = S(k), 0<k<n
-!                               a(0) = S(n)
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n/2)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n*5/4-1) :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddst(n, -1, a, ip, w)
-!         is 
-!             a(0) = a(0) / 2
-!             call ddst(n, 1, a, ip, w)
-!             do j = 0, n - 1
-!                 a(j) = a(j) * 2 / n
-!             end do
-!         .
-!
-!
-! -------- Cosine Transform of RDFT (Real Symmetric DFT) --------
-!     [definition]
-!         C(k) = sum_j=0^n a(j)*cos(pi*j*k/n), 0<=k<=n
-!     [usage]
-!         ip(0) = 0  ! first time only
-!         call dfct(n, a, t, ip, w)
-!     [parameters]
-!         n            :data length - 1 (integer)
-!                       n >= 2, n = power of 2
-!         a(0:n)       :input/output data (real*8)
-!                       output data
-!                           a(k) = C(k), 0<=k<=n
-!         t(0:n/2)     :work area (real*8)
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n/4)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n*5/8-1) :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             a(0) = a(0) / 2
-!             a(n) = a(n) / 2
-!             call dfct(n, a, t, ip, w)
-!         is 
-!             a(0) = a(0) / 2
-!             a(n) = a(n) / 2
-!             call dfct(n, a, t, ip, w)
-!             do j = 0, n
-!                 a(j) = a(j) * 2 / n
-!             end do
-!         .
-!
-!
-! -------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
-!     [definition]
-!         S(k) = sum_j=1^n-1 a(j)*sin(pi*j*k/n), 0<k<n
-!     [usage]
-!         ip(0) = 0  ! first time only
-!         call dfst(n, a, t, ip, w)
-!     [parameters]
-!         n            :data length + 1 (integer)
-!                       n >= 2, n = power of 2
-!         a(0:n-1)     :input/output data (real*8)
-!                       output data
-!                           a(k) = S(k), 0<k<n
-!                       (a(0) is used for work area)
-!         t(0:n/2-1)   :work area (real*8)
-!         ip(0:*)      :work area for bit reversal (integer)
-!                       length of ip >= 2+sqrt(n/4)
-!                       strictly, 
-!                       length of ip >= 
-!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
-!                       ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:n*5/8-1) :cos/sin table (real*8)
-!                       w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call dfst(n, a, t, ip, w)
-!         is 
-!             call dfst(n, a, t, ip, w)
-!             do j = 1, n - 1
-!                 a(j) = a(j) * 2 / n
-!             end do
-!         .
-!
-!
-! Appendix :
-!     The cos/sin table is recalculated when the larger table required.
-!     w() and ip() are compatible with all routines.
-!
-!
-      subroutine cdft(n, isgn, a, ip, w)
-      integer n, isgn, ip(0 : *), nw
-      real*8 a(0 : n - 1), w(0 : *)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      if (isgn .ge. 0) then
-          call cftfsub(n, a, ip, nw, w)
-      else
-          call cftbsub(n, a, ip, nw, w)
-      end if
-      end
-!
-      subroutine rdft(n, isgn, a, ip, w)
-      integer n, isgn, ip(0 : *), nw, nc
-      real*8 a(0 : n - 1), w(0 : *), xi
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. 4 * nc) then
-          nc = n / 4
-          call makect(nc, ip, w(nw))
-      end if
-      if (isgn .ge. 0) then
-          if (n .gt. 4) then
-              call cftfsub(n, a, ip, nw, w)
-              call rftfsub(n, a, nc, w(nw))
-          else if (n .eq. 4) then
-              call cftfsub(n, a, ip, nw, w)
-          end if
-          xi = a(0) - a(1)
-          a(0) = a(0) + a(1)
-          a(1) = xi
-      else
-          a(1) = 0.5d0 * (a(0) - a(1))
-          a(0) = a(0) - a(1)
-          if (n .gt. 4) then
-              call rftbsub(n, a, nc, w(nw))
-              call cftbsub(n, a, ip, nw, w)
-          else if (n .eq. 4) then
-              call cftbsub(n, a, ip, nw, w)
-          end if
-      end if
-      end
-!
-      subroutine ddct(n, isgn, a, ip, w)
-      integer n, isgn, ip(0 : *), j, nw, nc
-      real*8 a(0 : n - 1), w(0 : *), xr
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      if (isgn .lt. 0) then
-          xr = a(n - 1)
-          do j = n - 2, 2, -2
-              a(j + 1) = a(j) - a(j - 1)
-              a(j) = a(j) + a(j - 1)
-          end do
-          a(1) = a(0) - xr
-          a(0) = a(0) + xr
-          if (n .gt. 4) then
-              call rftbsub(n, a, nc, w(nw))
-              call cftbsub(n, a, ip, nw, w)
-          else if (n .eq. 4) then
-              call cftbsub(n, a, ip, nw, w)
-          end if
-      end if
-      call dctsub(n, a, nc, w(nw))
-      if (isgn .ge. 0) then
-          if (n .gt. 4) then
-              call cftfsub(n, a, ip, nw, w)
-              call rftfsub(n, a, nc, w(nw))
-          else if (n .eq. 4) then
-              call cftfsub(n, a, ip, nw, w)
-          end if
-          xr = a(0) - a(1)
-          a(0) = a(0) + a(1)
-          do j = 2, n - 2, 2
-              a(j - 1) = a(j) - a(j + 1)
-              a(j) = a(j) + a(j + 1)
-          end do
-          a(n - 1) = xr
-      end if
-      end
-!
-      subroutine ddst(n, isgn, a, ip, w)
-      integer n, isgn, ip(0 : *), j, nw, nc
-      real*8 a(0 : n - 1), w(0 : *), xr
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      if (isgn .lt. 0) then
-          xr = a(n - 1)
-          do j = n - 2, 2, -2
-              a(j + 1) = -a(j) - a(j - 1)
-              a(j) = a(j) - a(j - 1)
-          end do
-          a(1) = a(0) + xr
-          a(0) = a(0) - xr
-          if (n .gt. 4) then
-              call rftbsub(n, a, nc, w(nw))
-              call cftbsub(n, a, ip, nw, w)
-          else if (n .eq. 4) then
-              call cftbsub(n, a, ip, nw, w)
-          end if
-      end if
-      call dstsub(n, a, nc, w(nw))
-      if (isgn .ge. 0) then
-          if (n .gt. 4) then
-              call cftfsub(n, a, ip, nw, w)
-              call rftfsub(n, a, nc, w(nw))
-          else if (n .eq. 4) then
-              call cftfsub(n, a, ip, nw, w)
-          end if
-          xr = a(0) - a(1)
-          a(0) = a(0) + a(1)
-          do j = 2, n - 2, 2
-              a(j - 1) = -a(j) - a(j + 1)
-              a(j) = a(j) - a(j + 1)
-          end do
-          a(n - 1) = -xr
-      end if
-      end
-!
-      subroutine dfct(n, a, t, ip, w)
-      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
-      real*8 a(0 : n), t(0 : n / 2), w(0 : *), xr, xi, yr, yi
-      nw = ip(0)
-      if (n .gt. 8 * nw) then
-          nw = n / 8
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. 2 * nc) then
-          nc = n / 2
-          call makect(nc, ip, w(nw))
-      end if
-      m = n / 2
-      yi = a(m)
-      xi = a(0) + a(n)
-      a(0) = a(0) - a(n)
-      t(0) = xi - yi
-      t(m) = xi + yi
-      if (n .gt. 2) then
-          mh = m / 2
-          do j = 1, mh - 1
-              k = m - j
-              xr = a(j) - a(n - j)
-              xi = a(j) + a(n - j)
-              yr = a(k) - a(n - k)
-              yi = a(k) + a(n - k)
-              a(j) = xr
-              a(k) = yr
-              t(j) = xi - yi
-              t(k) = xi + yi
-          end do
-          t(mh) = a(mh) + a(n - mh)
-          a(mh) = a(mh) - a(n - mh)
-          call dctsub(m, a, nc, w(nw))
-          if (m .gt. 4) then
-              call cftfsub(m, a, ip, nw, w)
-              call rftfsub(m, a, nc, w(nw))
-          else if (m .eq. 4) then
-              call cftfsub(m, a, ip, nw, w)
-          end if
-          a(n - 1) = a(0) - a(1)
-          a(1) = a(0) + a(1)
-          do j = m - 2, 2, -2
-              a(2 * j + 1) = a(j) + a(j + 1)
-              a(2 * j - 1) = a(j) - a(j + 1)
-          end do
-          l = 2
-          m = mh
-          do while (m .ge. 2)
-              call dctsub(m, t, nc, w(nw))
-              if (m .gt. 4) then
-                  call cftfsub(m, t, ip, nw, w)
-                  call rftfsub(m, t, nc, w(nw))
-              else if (m .eq. 4) then
-                  call cftfsub(m, t, ip, nw, w)
-              end if
-              a(n - l) = t(0) - t(1)
-              a(l) = t(0) + t(1)
-              k = 0
-              do j = 2, m - 2, 2
-                  k = k + 4 * l
-                  a(k - l) = t(j) - t(j + 1)
-                  a(k + l) = t(j) + t(j + 1)
-              end do
-              l = 2 * l
-              mh = m / 2
-              do j = 0, mh - 1
-                  k = m - j
-                  t(j) = t(m + k) - t(m + j)
-                  t(k) = t(m + k) + t(m + j)
-              end do
-              t(mh) = t(m + mh)
-              m = mh
-          end do
-          a(l) = t(0)
-          a(n) = t(2) - t(1)
-          a(0) = t(2) + t(1)
-      else
-          a(1) = a(0)
-          a(2) = t(0)
-          a(0) = t(1)
-      end if
-      end
-!
-      subroutine dfst(n, a, t, ip, w)
-      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
-      real*8 a(0 : n - 1), t(0 : n / 2 - 1), w(0 : *), xr, xi, yr, yi
-      nw = ip(0)
-      if (n .gt. 8 * nw) then
-          nw = n / 8
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. 2 * nc) then
-          nc = n / 2
-          call makect(nc, ip, w(nw))
-      end if
-      if (n .gt. 2) then
-          m = n / 2
-          mh = m / 2
-          do j = 1, mh - 1
-              k = m - j
-              xr = a(j) + a(n - j)
-              xi = a(j) - a(n - j)
-              yr = a(k) + a(n - k)
-              yi = a(k) - a(n - k)
-              a(j) = xr
-              a(k) = yr
-              t(j) = xi + yi
-              t(k) = xi - yi
-          end do
-          t(0) = a(mh) - a(n - mh)
-          a(mh) = a(mh) + a(n - mh)
-          a(0) = a(m)
-          call dstsub(m, a, nc, w(nw))
-          if (m .gt. 4) then
-              call cftfsub(m, a, ip, nw, w)
-              call rftfsub(m, a, nc, w(nw))
-          else if (m .eq. 4) then
-              call cftfsub(m, a, ip, nw, w)
-          end if
-          a(n - 1) = a(1) - a(0)
-          a(1) = a(0) + a(1)
-          do j = m - 2, 2, -2
-              a(2 * j + 1) = a(j) - a(j + 1)
-              a(2 * j - 1) = -a(j) - a(j + 1)
-          end do
-          l = 2
-          m = mh
-          do while (m .ge. 2)
-              call dstsub(m, t, nc, w(nw))
-              if (m .gt. 4) then
-                  call cftfsub(m, t, ip, nw, w)
-                  call rftfsub(m, t, nc, w(nw))
-              else if (m .eq. 4) then
-                  call cftfsub(m, t, ip, nw, w)
-              end if
-              a(n - l) = t(1) - t(0)
-              a(l) = t(0) + t(1)
-              k = 0
-              do j = 2, m - 2, 2
-                  k = k + 4 * l
-                  a(k - l) = -t(j) - t(j + 1)
-                  a(k + l) = t(j) - t(j + 1)
-              end do
-              l = 2 * l
-              mh = m / 2
-              do j = 1, mh - 1
-                  k = m - j
-                  t(j) = t(m + k) + t(m + j)
-                  t(k) = t(m + k) - t(m + j)
-              end do
-              t(0) = t(m + mh)
-              m = mh
-          end do
-          a(l) = t(0)
-      end if
-      a(0) = 0
-      end
-!
-! -------- initializing routines --------
-!
-      subroutine makewt(nw, ip, w)
-      integer nw, ip(0 : *), j, nwh, nw0, nw1
-      real*8 w(0 : nw - 1), delta, wn4r, wk1r, wk1i, wk3r, wk3i
-      ip(0) = nw
-      ip(1) = 1
-      if (nw .gt. 2) then
-          nwh = nw / 2
-          delta = atan(1.0d0) / nwh
-          wn4r = cos(delta * nwh)
-          w(0) = 1
-          w(1) = wn4r
-          if (nwh .eq. 4) then
-              w(2) = cos(delta * 2)
-              w(3) = sin(delta * 2)
-          else if (nwh .gt. 4) then
-              call makeipt(nw, ip)
-              w(2) = 0.5d0 / cos(delta * 2)
-              w(3) = 0.5d0 / cos(delta * 6)
-              do j = 4, nwh - 4, 4
-                  w(j) = cos(delta * j)
-                  w(j + 1) = sin(delta * j)
-                  w(j + 2) = cos(3 * delta * j)
-                  w(j + 3) = -sin(3 * delta * j)
-              end do
-          end if
-          nw0 = 0
-          do while (nwh .gt. 2)
-              nw1 = nw0 + nwh
-              nwh = nwh / 2
-              w(nw1) = 1
-              w(nw1 + 1) = wn4r
-              if (nwh .eq. 4) then
-                  wk1r = w(nw0 + 4)
-                  wk1i = w(nw0 + 5)
-                  w(nw1 + 2) = wk1r
-                  w(nw1 + 3) = wk1i
-              else if (nwh .gt. 4) then
-                  wk1r = w(nw0 + 4)
-                  wk3r = w(nw0 + 6)
-                  w(nw1 + 2) = 0.5d0 / wk1r
-                  w(nw1 + 3) = 0.5d0 / wk3r
-                  do j = 4, nwh - 4, 4
-                      wk1r = w(nw0 + 2 * j)
-                      wk1i = w(nw0 + 2 * j + 1)
-                      wk3r = w(nw0 + 2 * j + 2)
-                      wk3i = w(nw0 + 2 * j + 3)
-                      w(nw1 + j) = wk1r
-                      w(nw1 + j + 1) = wk1i
-                      w(nw1 + j + 2) = wk3r
-                      w(nw1 + j + 3) = wk3i
-                  end do
-              end if
-              nw0 = nw1
-          end do
-      end if
-      end
-!
-      subroutine makeipt(nw, ip)
-      integer nw, ip(0 : *), j, l, m, m2, p, q
-      ip(2) = 0
-      ip(3) = 16
-      m = 2
-      l = nw
-      do while (l .gt. 32)
-          m2 = 2 * m
-          q = 8 * m2
-          do j = m, m2 - 1
-              p = 4 * ip(j)
-              ip(m + j) = p
-              ip(m2 + j) = p + q
-          end do
-          m = m2
-          l = l / 4
-      end do
-      end
-!
-      subroutine makect(nc, ip, c)
-      integer nc, ip(0 : *), j, nch
-      real*8 c(0 : nc - 1), delta
-      ip(1) = nc
-      if (nc .gt. 1) then
-          nch = nc / 2
-          delta = atan(1.0d0) / nch
-          c(0) = cos(delta * nch)
-          c(nch) = 0.5d0 * c(0)
-          do j = 1, nch - 1
-              c(j) = 0.5d0 * cos(delta * j)
-              c(nc - j) = 0.5d0 * sin(delta * j)
-          end do
-      end if
-      end
-!
-! -------- child routines --------
-!
-      subroutine cftfsub(n, a, ip, nw, w)
-      integer n, ip(0 : *), nw
-      real*8 a(0 : n - 1), w(0 : nw - 1)
-      if (n .gt. 8) then
-          if (n .gt. 32) then
-              call cftf1st(n, a, w(nw - n / 4))
-              if (n .gt. 512) then
-                  call cftrec4(n, a, nw, w)
-              else if (n .gt. 128) then
-                  call cftleaf(n, 1, a, nw, w)
-              else
-                  call cftfx41(n, a, nw, w)
-              end if
-              call bitrv2(n, ip, a)
-          else if (n .eq. 32) then
-              call cftf161(a, w(nw - 8))
-              call bitrv216(a)
-          else
-              call cftf081(a, w)
-              call bitrv208(a)
-          end if
-      else if (n .eq. 8) then
-          call cftf040(a)
-      else if (n .eq. 4) then
-          call cftx020(a)
-      end if
-      end
-!
-      subroutine cftbsub(n, a, ip, nw, w)
-      integer n, ip(0 : *), nw
-      real*8 a(0 : n - 1), w(0 : nw - 1)
-      if (n .gt. 8) then
-          if (n .gt. 32) then
-              call cftb1st(n, a, w(nw - n / 4))
-              if (n .gt. 512) then
-                  call cftrec4(n, a, nw, w)
-              else if (n .gt. 128) then
-                  call cftleaf(n, 1, a, nw, w)
-              else
-                  call cftfx41(n, a, nw, w)
-              end if
-              call bitrv2conj(n, ip, a)
-          else if (n .eq. 32) then
-              call cftf161(a, w(nw - 8))
-              call bitrv216neg(a)
-          else
-              call cftf081(a, w)
-              call bitrv208neg(a)
-          end if
-      else if (n .eq. 8) then
-          call cftb040(a)
-      else if (n .eq. 4) then
-          call cftx020(a)
-      end if
-      end
-!
-      subroutine bitrv2(n, ip, a)
-      integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
-      real*8 a(0 : n - 1), xr, xi, yr, yi
-      m = 1
-      l = n / 4
-      do while (l .gt. 8)
-          m = m * 2
-          l = l / 4
-      end do
-      nh = n / 2
-      nm = 4 * m
-      if (l .eq. 8) then
-          do k = 0, m - 1
-              do j = 0, k - 1
-                  j1 = 4 * j + 2 * ip(m + k)
-                  k1 = 4 * k + 2 * ip(m + j)
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nh
-                  k1 = k1 + 2
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + 2
-                  k1 = k1 + nh
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nh
-                  k1 = k1 - 2
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-              k1 = 4 * k + 2 * ip(m + k)
-              j1 = k1 + 2
-              k1 = k1 + nh
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nm
-              k1 = k1 + 2 * nm
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nm
-              k1 = k1 - nm
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 - 2
-              k1 = k1 - nh
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nh + 2
-              k1 = k1 + nh + 2
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 - nh + nm
-              k1 = k1 + 2 * nm - 2
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-          end do
-      else
-          do k = 0, m - 1
-              do j = 0, k - 1
-                  j1 = 4 * j + ip(m + k)
-                  k1 = 4 * k + ip(m + j)
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nh
-                  k1 = k1 + 2
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + 2
-                  k1 = k1 + nh
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nh
-                  k1 = k1 - 2
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = a(j1 + 1)
-                  yr = a(k1)
-                  yi = a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-              k1 = 4 * k + ip(m + k)
-              j1 = k1 + 2
-              k1 = k1 + nh
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nm
-              k1 = k1 + nm
-              xr = a(j1)
-              xi = a(j1 + 1)
-              yr = a(k1)
-              yi = a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-          end do
-      end if
-      end
-!
-      subroutine bitrv2conj(n, ip, a)
-      integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
-      real*8 a(0 : n - 1), xr, xi, yr, yi
-      m = 1
-      l = n / 4
-      do while (l .gt. 8)
-          m = m * 2
-          l = l / 4
-      end do
-      nh = n / 2
-      nm = 4 * m
-      if (l .eq. 8) then
-          do k = 0, m - 1
-              do j = 0, k - 1
-                  j1 = 4 * j + 2 * ip(m + k)
-                  k1 = 4 * k + 2 * ip(m + j)
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nh
-                  k1 = k1 + 2
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + 2
-                  k1 = k1 + nh
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nh
-                  k1 = k1 - 2
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - 2 * nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-              k1 = 4 * k + 2 * ip(m + k)
-              j1 = k1 + 2
-              k1 = k1 + nh
-              a(j1 - 1) = -a(j1 - 1)
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              a(k1 + 3) = -a(k1 + 3)
-              j1 = j1 + nm
-              k1 = k1 + 2 * nm
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nm
-              k1 = k1 - nm
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 - 2
-              k1 = k1 - nh
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 + nh + 2
-              k1 = k1 + nh + 2
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              j1 = j1 - nh + nm
-              k1 = k1 + 2 * nm - 2
-              a(j1 - 1) = -a(j1 - 1)
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              a(k1 + 3) = -a(k1 + 3)
-          end do
-      else
-          do k = 0, m - 1
-              do j = 0, k - 1
-                  j1 = 4 * j + ip(m + k)
-                  k1 = 4 * k + ip(m + j)
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nh
-                  k1 = k1 + 2
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + 2
-                  k1 = k1 + nh
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 + nm
-                  k1 = k1 + nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nh
-                  k1 = k1 - 2
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-                  j1 = j1 - nm
-                  k1 = k1 - nm
-                  xr = a(j1)
-                  xi = -a(j1 + 1)
-                  yr = a(k1)
-                  yi = -a(k1 + 1)
-                  a(j1) = yr
-                  a(j1 + 1) = yi
-                  a(k1) = xr
-                  a(k1 + 1) = xi
-              end do
-              k1 = 4 * k + ip(m + k)
-              j1 = k1 + 2
-              k1 = k1 + nh
-              a(j1 - 1) = -a(j1 - 1)
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              a(k1 + 3) = -a(k1 + 3)
-              j1 = j1 + nm
-              k1 = k1 + nm
-              a(j1 - 1) = -a(j1 - 1)
-              xr = a(j1)
-              xi = -a(j1 + 1)
-              yr = a(k1)
-              yi = -a(k1 + 1)
-              a(j1) = yr
-              a(j1 + 1) = yi
-              a(k1) = xr
-              a(k1 + 1) = xi
-              a(k1 + 3) = -a(k1 + 3)
-          end do
-      end if
-      end
-!
-      subroutine bitrv216(a)
-      real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
-      real*8 x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i
-      real*8 x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i
-      x1r = a(2)
-      x1i = a(3)
-      x2r = a(4)
-      x2i = a(5)
-      x3r = a(6)
-      x3i = a(7)
-      x4r = a(8)
-      x4i = a(9)
-      x5r = a(10)
-      x5i = a(11)
-      x7r = a(14)
-      x7i = a(15)
-      x8r = a(16)
-      x8i = a(17)
-      x10r = a(20)
-      x10i = a(21)
-      x11r = a(22)
-      x11i = a(23)
-      x12r = a(24)
-      x12i = a(25)
-      x13r = a(26)
-      x13i = a(27)
-      x14r = a(28)
-      x14i = a(29)
-      a(2) = x8r
-      a(3) = x8i
-      a(4) = x4r
-      a(5) = x4i
-      a(6) = x12r
-      a(7) = x12i
-      a(8) = x2r
-      a(9) = x2i
-      a(10) = x10r
-      a(11) = x10i
-      a(14) = x14r
-      a(15) = x14i
-      a(16) = x1r
-      a(17) = x1i
-      a(20) = x5r
-      a(21) = x5i
-      a(22) = x13r
-      a(23) = x13i
-      a(24) = x3r
-      a(25) = x3i
-      a(26) = x11r
-      a(27) = x11i
-      a(28) = x7r
-      a(29) = x7i
-      end
-!
-      subroutine bitrv216neg(a)
-      real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
-      real*8 x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i
-      real*8 x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i
-      real*8 x13r, x13i, x14r, x14i, x15r, x15i
-      x1r = a(2)
-      x1i = a(3)
-      x2r = a(4)
-      x2i = a(5)
-      x3r = a(6)
-      x3i = a(7)
-      x4r = a(8)
-      x4i = a(9)
-      x5r = a(10)
-      x5i = a(11)
-      x6r = a(12)
-      x6i = a(13)
-      x7r = a(14)
-      x7i = a(15)
-      x8r = a(16)
-      x8i = a(17)
-      x9r = a(18)
-      x9i = a(19)
-      x10r = a(20)
-      x10i = a(21)
-      x11r = a(22)
-      x11i = a(23)
-      x12r = a(24)
-      x12i = a(25)
-      x13r = a(26)
-      x13i = a(27)
-      x14r = a(28)
-      x14i = a(29)
-      x15r = a(30)
-      x15i = a(31)
-      a(2) = x15r
-      a(3) = x15i
-      a(4) = x7r
-      a(5) = x7i
-      a(6) = x11r
-      a(7) = x11i
-      a(8) = x3r
-      a(9) = x3i
-      a(10) = x13r
-      a(11) = x13i
-      a(12) = x5r
-      a(13) = x5i
-      a(14) = x9r
-      a(15) = x9i
-      a(16) = x1r
-      a(17) = x1i
-      a(18) = x14r
-      a(19) = x14i
-      a(20) = x6r
-      a(21) = x6i
-      a(22) = x10r
-      a(23) = x10i
-      a(24) = x2r
-      a(25) = x2i
-      a(26) = x12r
-      a(27) = x12i
-      a(28) = x4r
-      a(29) = x4i
-      a(30) = x8r
-      a(31) = x8i
-      end
-!
-      subroutine bitrv208(a)
-      real*8 a(0 : 15), x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i
-      x1r = a(2)
-      x1i = a(3)
-      x3r = a(6)
-      x3i = a(7)
-      x4r = a(8)
-      x4i = a(9)
-      x6r = a(12)
-      x6i = a(13)
-      a(2) = x4r
-      a(3) = x4i
-      a(6) = x6r
-      a(7) = x6i
-      a(8) = x1r
-      a(9) = x1i
-      a(12) = x3r
-      a(13) = x3i
-      end
-!
-      subroutine bitrv208neg(a)
-      real*8 a(0 : 15), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
-      real*8 x5r, x5i, x6r, x6i, x7r, x7i
-      x1r = a(2)
-      x1i = a(3)
-      x2r = a(4)
-      x2i = a(5)
-      x3r = a(6)
-      x3i = a(7)
-      x4r = a(8)
-      x4i = a(9)
-      x5r = a(10)
-      x5i = a(11)
-      x6r = a(12)
-      x6i = a(13)
-      x7r = a(14)
-      x7i = a(15)
-      a(2) = x7r
-      a(3) = x7i
-      a(4) = x3r
-      a(5) = x3i
-      a(6) = x5r
-      a(7) = x5i
-      a(8) = x1r
-      a(9) = x1i
-      a(10) = x6r
-      a(11) = x6i
-      a(12) = x2r
-      a(13) = x2i
-      a(14) = x4r
-      a(15) = x4i
-      end
-!
-      subroutine cftf1st(n, a, w)
-      integer n, j, j0, j1, j2, j3, k, m, mh
-      real*8 a(0 : n - 1), w(0 : *)
-      real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
-      real*8 wd1r, wd1i, wd3r, wd3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      mh = n / 8
-      m = 2 * mh
-      j1 = m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(0) + a(j2)
-      x0i = a(1) + a(j2 + 1)
-      x1r = a(0) - a(j2)
-      x1i = a(1) - a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(0) = x0r + x2r
-      a(1) = x0i + x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i - x2i
-      a(j2) = x1r - x3i
-      a(j2 + 1) = x1i + x3r
-      a(j3) = x1r + x3i
-      a(j3 + 1) = x1i - x3r
-      wn4r = w(1)
-      csc1 = w(2)
-      csc3 = w(3)
-      wd1r = 1
-      wd1i = 0
-      wd3r = 1
-      wd3i = 0
-      k = 0
-      do j = 2, mh - 6, 4
-          k = k + 4
-          wk1r = csc1 * (wd1r + w(k))
-          wk1i = csc1 * (wd1i + w(k + 1))
-          wk3r = csc3 * (wd3r + w(k + 2))
-          wk3i = csc3 * (wd3i + w(k + 3))
-          wd1r = w(k)
-          wd1i = w(k + 1)
-          wd3r = w(k + 2)
-          wd3i = w(k + 3)
-          j1 = j + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j) + a(j2)
-          x0i = a(j + 1) + a(j2 + 1)
-          x1r = a(j) - a(j2)
-          x1i = a(j + 1) - a(j2 + 1)
-          y0r = a(j + 2) + a(j2 + 2)
-          y0i = a(j + 3) + a(j2 + 3)
-          y1r = a(j + 2) - a(j2 + 2)
-          y1i = a(j + 3) - a(j2 + 3)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          y2r = a(j1 + 2) + a(j3 + 2)
-          y2i = a(j1 + 3) + a(j3 + 3)
-          y3r = a(j1 + 2) - a(j3 + 2)
-          y3i = a(j1 + 3) - a(j3 + 3)
-          a(j) = x0r + x2r
-          a(j + 1) = x0i + x2i
-          a(j + 2) = y0r + y2r
-          a(j + 3) = y0i + y2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i - x2i
-          a(j1 + 2) = y0r - y2r
-          a(j1 + 3) = y0i - y2i
-          x0r = x1r - x3i
-          x0i = x1i + x3r
-          a(j2) = wk1r * x0r - wk1i * x0i
-          a(j2 + 1) = wk1r * x0i + wk1i * x0r
-          x0r = y1r - y3i
-          x0i = y1i + y3r
-          a(j2 + 2) = wd1r * x0r - wd1i * x0i
-          a(j2 + 3) = wd1r * x0i + wd1i * x0r
-          x0r = x1r + x3i
-          x0i = x1i - x3r
-          a(j3) = wk3r * x0r + wk3i * x0i
-          a(j3 + 1) = wk3r * x0i - wk3i * x0r
-          x0r = y1r + y3i
-          x0i = y1i - y3r
-          a(j3 + 2) = wd3r * x0r + wd3i * x0i
-          a(j3 + 3) = wd3r * x0i - wd3i * x0r
-          j0 = m - j
-          j1 = j0 + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j0) + a(j2)
-          x0i = a(j0 + 1) + a(j2 + 1)
-          x1r = a(j0) - a(j2)
-          x1i = a(j0 + 1) - a(j2 + 1)
-          y0r = a(j0 - 2) + a(j2 - 2)
-          y0i = a(j0 - 1) + a(j2 - 1)
-          y1r = a(j0 - 2) - a(j2 - 2)
-          y1i = a(j0 - 1) - a(j2 - 1)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          y2r = a(j1 - 2) + a(j3 - 2)
-          y2i = a(j1 - 1) + a(j3 - 1)
-          y3r = a(j1 - 2) - a(j3 - 2)
-          y3i = a(j1 - 1) - a(j3 - 1)
-          a(j0) = x0r + x2r
-          a(j0 + 1) = x0i + x2i
-          a(j0 - 2) = y0r + y2r
-          a(j0 - 1) = y0i + y2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i - x2i
-          a(j1 - 2) = y0r - y2r
-          a(j1 - 1) = y0i - y2i
-          x0r = x1r - x3i
-          x0i = x1i + x3r
-          a(j2) = wk1i * x0r - wk1r * x0i
-          a(j2 + 1) = wk1i * x0i + wk1r * x0r
-          x0r = y1r - y3i
-          x0i = y1i + y3r
-          a(j2 - 2) = wd1i * x0r - wd1r * x0i
-          a(j2 - 1) = wd1i * x0i + wd1r * x0r
-          x0r = x1r + x3i
-          x0i = x1i - x3r
-          a(j3) = wk3i * x0r + wk3r * x0i
-          a(j3 + 1) = wk3i * x0i - wk3r * x0r
-          x0r = y1r + y3i
-          x0i = y1i - y3r
-          a(j3 - 2) = wd3i * x0r + wd3r * x0i
-          a(j3 - 1) = wd3i * x0i - wd3r * x0r
-      end do
-      wk1r = csc1 * (wd1r + wn4r)
-      wk1i = csc1 * (wd1i + wn4r)
-      wk3r = csc3 * (wd3r - wn4r)
-      wk3i = csc3 * (wd3i - wn4r)
-      j0 = mh
-      j1 = j0 + m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(j0 - 2) + a(j2 - 2)
-      x0i = a(j0 - 1) + a(j2 - 1)
-      x1r = a(j0 - 2) - a(j2 - 2)
-      x1i = a(j0 - 1) - a(j2 - 1)
-      x2r = a(j1 - 2) + a(j3 - 2)
-      x2i = a(j1 - 1) + a(j3 - 1)
-      x3r = a(j1 - 2) - a(j3 - 2)
-      x3i = a(j1 - 1) - a(j3 - 1)
-      a(j0 - 2) = x0r + x2r
-      a(j0 - 1) = x0i + x2i
-      a(j1 - 2) = x0r - x2r
-      a(j1 - 1) = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      a(j2 - 2) = wk1r * x0r - wk1i * x0i
-      a(j2 - 1) = wk1r * x0i + wk1i * x0r
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      a(j3 - 2) = wk3r * x0r + wk3i * x0i
-      a(j3 - 1) = wk3r * x0i - wk3i * x0r
-      x0r = a(j0) + a(j2)
-      x0i = a(j0 + 1) + a(j2 + 1)
-      x1r = a(j0) - a(j2)
-      x1i = a(j0 + 1) - a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(j0) = x0r + x2r
-      a(j0 + 1) = x0i + x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      a(j2) = wn4r * (x0r - x0i)
-      a(j2 + 1) = wn4r * (x0i + x0r)
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      a(j3) = -wn4r * (x0r + x0i)
-      a(j3 + 1) = -wn4r * (x0i - x0r)
-      x0r = a(j0 + 2) + a(j2 + 2)
-      x0i = a(j0 + 3) + a(j2 + 3)
-      x1r = a(j0 + 2) - a(j2 + 2)
-      x1i = a(j0 + 3) - a(j2 + 3)
-      x2r = a(j1 + 2) + a(j3 + 2)
-      x2i = a(j1 + 3) + a(j3 + 3)
-      x3r = a(j1 + 2) - a(j3 + 2)
-      x3i = a(j1 + 3) - a(j3 + 3)
-      a(j0 + 2) = x0r + x2r
-      a(j0 + 3) = x0i + x2i
-      a(j1 + 2) = x0r - x2r
-      a(j1 + 3) = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      a(j2 + 2) = wk1i * x0r - wk1r * x0i
-      a(j2 + 3) = wk1i * x0i + wk1r * x0r
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      a(j3 + 2) = wk3i * x0r + wk3r * x0i
-      a(j3 + 3) = wk3i * x0i - wk3r * x0r
-      end
-!
-      subroutine cftb1st(n, a, w)
-      integer n, j, j0, j1, j2, j3, k, m, mh
-      real*8 a(0 : n - 1), w(0 : *)
-      real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
-      real*8 wd1r, wd1i, wd3r, wd3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      mh = n / 8
-      m = 2 * mh
-      j1 = m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(0) + a(j2)
-      x0i = -a(1) - a(j2 + 1)
-      x1r = a(0) - a(j2)
-      x1i = -a(1) + a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(0) = x0r + x2r
-      a(1) = x0i - x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i + x2i
-      a(j2) = x1r + x3i
-      a(j2 + 1) = x1i + x3r
-      a(j3) = x1r - x3i
-      a(j3 + 1) = x1i - x3r
-      wn4r = w(1)
-      csc1 = w(2)
-      csc3 = w(3)
-      wd1r = 1
-      wd1i = 0
-      wd3r = 1
-      wd3i = 0
-      k = 0
-      do j = 2, mh - 6, 4
-          k = k + 4
-          wk1r = csc1 * (wd1r + w(k))
-          wk1i = csc1 * (wd1i + w(k + 1))
-          wk3r = csc3 * (wd3r + w(k + 2))
-          wk3i = csc3 * (wd3i + w(k + 3))
-          wd1r = w(k)
-          wd1i = w(k + 1)
-          wd3r = w(k + 2)
-          wd3i = w(k + 3)
-          j1 = j + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j) + a(j2)
-          x0i = -a(j + 1) - a(j2 + 1)
-          x1r = a(j) - a(j2)
-          x1i = -a(j + 1) + a(j2 + 1)
-          y0r = a(j + 2) + a(j2 + 2)
-          y0i = -a(j + 3) - a(j2 + 3)
-          y1r = a(j + 2) - a(j2 + 2)
-          y1i = -a(j + 3) + a(j2 + 3)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          y2r = a(j1 + 2) + a(j3 + 2)
-          y2i = a(j1 + 3) + a(j3 + 3)
-          y3r = a(j1 + 2) - a(j3 + 2)
-          y3i = a(j1 + 3) - a(j3 + 3)
-          a(j) = x0r + x2r
-          a(j + 1) = x0i - x2i
-          a(j + 2) = y0r + y2r
-          a(j + 3) = y0i - y2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i + x2i
-          a(j1 + 2) = y0r - y2r
-          a(j1 + 3) = y0i + y2i
-          x0r = x1r + x3i
-          x0i = x1i + x3r
-          a(j2) = wk1r * x0r - wk1i * x0i
-          a(j2 + 1) = wk1r * x0i + wk1i * x0r
-          x0r = y1r + y3i
-          x0i = y1i + y3r
-          a(j2 + 2) = wd1r * x0r - wd1i * x0i
-          a(j2 + 3) = wd1r * x0i + wd1i * x0r
-          x0r = x1r - x3i
-          x0i = x1i - x3r
-          a(j3) = wk3r * x0r + wk3i * x0i
-          a(j3 + 1) = wk3r * x0i - wk3i * x0r
-          x0r = y1r - y3i
-          x0i = y1i - y3r
-          a(j3 + 2) = wd3r * x0r + wd3i * x0i
-          a(j3 + 3) = wd3r * x0i - wd3i * x0r
-          j0 = m - j
-          j1 = j0 + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j0) + a(j2)
-          x0i = -a(j0 + 1) - a(j2 + 1)
-          x1r = a(j0) - a(j2)
-          x1i = -a(j0 + 1) + a(j2 + 1)
-          y0r = a(j0 - 2) + a(j2 - 2)
-          y0i = -a(j0 - 1) - a(j2 - 1)
-          y1r = a(j0 - 2) - a(j2 - 2)
-          y1i = -a(j0 - 1) + a(j2 - 1)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          y2r = a(j1 - 2) + a(j3 - 2)
-          y2i = a(j1 - 1) + a(j3 - 1)
-          y3r = a(j1 - 2) - a(j3 - 2)
-          y3i = a(j1 - 1) - a(j3 - 1)
-          a(j0) = x0r + x2r
-          a(j0 + 1) = x0i - x2i
-          a(j0 - 2) = y0r + y2r
-          a(j0 - 1) = y0i - y2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i + x2i
-          a(j1 - 2) = y0r - y2r
-          a(j1 - 1) = y0i + y2i
-          x0r = x1r + x3i
-          x0i = x1i + x3r
-          a(j2) = wk1i * x0r - wk1r * x0i
-          a(j2 + 1) = wk1i * x0i + wk1r * x0r
-          x0r = y1r + y3i
-          x0i = y1i + y3r
-          a(j2 - 2) = wd1i * x0r - wd1r * x0i
-          a(j2 - 1) = wd1i * x0i + wd1r * x0r
-          x0r = x1r - x3i
-          x0i = x1i - x3r
-          a(j3) = wk3i * x0r + wk3r * x0i
-          a(j3 + 1) = wk3i * x0i - wk3r * x0r
-          x0r = y1r - y3i
-          x0i = y1i - y3r
-          a(j3 - 2) = wd3i * x0r + wd3r * x0i
-          a(j3 - 1) = wd3i * x0i - wd3r * x0r
-      end do
-      wk1r = csc1 * (wd1r + wn4r)
-      wk1i = csc1 * (wd1i + wn4r)
-      wk3r = csc3 * (wd3r - wn4r)
-      wk3i = csc3 * (wd3i - wn4r)
-      j0 = mh
-      j1 = j0 + m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(j0 - 2) + a(j2 - 2)
-      x0i = -a(j0 - 1) - a(j2 - 1)
-      x1r = a(j0 - 2) - a(j2 - 2)
-      x1i = -a(j0 - 1) + a(j2 - 1)
-      x2r = a(j1 - 2) + a(j3 - 2)
-      x2i = a(j1 - 1) + a(j3 - 1)
-      x3r = a(j1 - 2) - a(j3 - 2)
-      x3i = a(j1 - 1) - a(j3 - 1)
-      a(j0 - 2) = x0r + x2r
-      a(j0 - 1) = x0i - x2i
-      a(j1 - 2) = x0r - x2r
-      a(j1 - 1) = x0i + x2i
-      x0r = x1r + x3i
-      x0i = x1i + x3r
-      a(j2 - 2) = wk1r * x0r - wk1i * x0i
-      a(j2 - 1) = wk1r * x0i + wk1i * x0r
-      x0r = x1r - x3i
-      x0i = x1i - x3r
-      a(j3 - 2) = wk3r * x0r + wk3i * x0i
-      a(j3 - 1) = wk3r * x0i - wk3i * x0r
-      x0r = a(j0) + a(j2)
-      x0i = -a(j0 + 1) - a(j2 + 1)
-      x1r = a(j0) - a(j2)
-      x1i = -a(j0 + 1) + a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(j0) = x0r + x2r
-      a(j0 + 1) = x0i - x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i + x2i
-      x0r = x1r + x3i
-      x0i = x1i + x3r
-      a(j2) = wn4r * (x0r - x0i)
-      a(j2 + 1) = wn4r * (x0i + x0r)
-      x0r = x1r - x3i
-      x0i = x1i - x3r
-      a(j3) = -wn4r * (x0r + x0i)
-      a(j3 + 1) = -wn4r * (x0i - x0r)
-      x0r = a(j0 + 2) + a(j2 + 2)
-      x0i = -a(j0 + 3) - a(j2 + 3)
-      x1r = a(j0 + 2) - a(j2 + 2)
-      x1i = -a(j0 + 3) + a(j2 + 3)
-      x2r = a(j1 + 2) + a(j3 + 2)
-      x2i = a(j1 + 3) + a(j3 + 3)
-      x3r = a(j1 + 2) - a(j3 + 2)
-      x3i = a(j1 + 3) - a(j3 + 3)
-      a(j0 + 2) = x0r + x2r
-      a(j0 + 3) = x0i - x2i
-      a(j1 + 2) = x0r - x2r
-      a(j1 + 3) = x0i + x2i
-      x0r = x1r + x3i
-      x0i = x1i + x3r
-      a(j2 + 2) = wk1i * x0r - wk1r * x0i
-      a(j2 + 3) = wk1i * x0i + wk1r * x0r
-      x0r = x1r - x3i
-      x0i = x1i - x3r
-      a(j3 + 2) = wk3i * x0r + wk3r * x0i
-      a(j3 + 3) = wk3i * x0i - wk3r * x0r
-      end
-!
-      subroutine cftrec4(n, a, nw, w)
-      integer n, nw, cfttree, isplt, j, k, m
-      real*8 a(0 : n - 1), w(0 : nw - 1)
-      m = n
-      do while (m .gt. 512)
-          m = m / 4
-          call cftmdl1(m, a(n - m), w(nw - m / 2))
-      end do
-      call cftleaf(m, 1, a(n - m), nw, w)
-      k = 0
-      do j = n - m, m, -m
-          k = k + 1
-          isplt = cfttree(m, j, k, a, nw, w)
-          call cftleaf(m, isplt, a(j - m), nw, w)
-      end do
-      end
-!
-      integer function cfttree(n, j, k, a, nw, w)
-      integer n, j, k, nw, i, isplt, m
-      real*8 a(0 : j - 1), w(0 : nw - 1)
-      if (mod(k, 4) .ne. 0) then
-          isplt = mod(k, 2)
-          if (isplt .ne. 0) then
-              call cftmdl1(n, a(j - n), w(nw - n / 2))
-          else
-              call cftmdl2(n, a(j - n), w(nw - n))
-          end if
-      else
-          m = n
-          i = k
-          do while (mod(i, 4) .eq. 0)
-              m = m * 4
-              i = i / 4
-          end do
-          isplt = mod(i, 2)
-          if (isplt .ne. 0) then
-              do while (m .gt. 128)
-                  call cftmdl1(m, a(j - m), w(nw - m / 2))
-                  m = m / 4
-              end do
-          else
-              do while (m .gt. 128)
-                  call cftmdl2(m, a(j - m), w(nw - m))
-                  m = m / 4
-              end do
-          end if
-      end if
-      cfttree = isplt
-      end
-!
-      subroutine cftleaf(n, isplt, a, nw, w)
-      integer n, isplt, nw
-      real*8 a(0 : n - 1), w(0 : nw - 1)
-      if (n .eq. 512) then
-          call cftmdl1(128, a, w(nw - 64))
-          call cftf161(a, w(nw - 8))
-          call cftf162(a(32), w(nw - 32))
-          call cftf161(a(64), w(nw - 8))
-          call cftf161(a(96), w(nw - 8))
-          call cftmdl2(128, a(128), w(nw - 128))
-          call cftf161(a(128), w(nw - 8))
-          call cftf162(a(160), w(nw - 32))
-          call cftf161(a(192), w(nw - 8))
-          call cftf162(a(224), w(nw - 32))
-          call cftmdl1(128, a(256), w(nw - 64))
-          call cftf161(a(256), w(nw - 8))
-          call cftf162(a(288), w(nw - 32))
-          call cftf161(a(320), w(nw - 8))
-          call cftf161(a(352), w(nw - 8))
-          if (isplt .ne. 0) then
-              call cftmdl1(128, a(384), w(nw - 64))
-              call cftf161(a(480), w(nw - 8))
-          else
-              call cftmdl2(128, a(384), w(nw - 128))
-              call cftf162(a(480), w(nw - 32))
-          end if
-          call cftf161(a(384), w(nw - 8))
-          call cftf162(a(416), w(nw - 32))
-          call cftf161(a(448), w(nw - 8))
-      else
-          call cftmdl1(64, a, w(nw - 32))
-          call cftf081(a, w(nw - 8))
-          call cftf082(a(16), w(nw - 8))
-          call cftf081(a(32), w(nw - 8))
-          call cftf081(a(48), w(nw - 8))
-          call cftmdl2(64, a(64), w(nw - 64))
-          call cftf081(a(64), w(nw - 8))
-          call cftf082(a(80), w(nw - 8))
-          call cftf081(a(96), w(nw - 8))
-          call cftf082(a(112), w(nw - 8))
-          call cftmdl1(64, a(128), w(nw - 32))
-          call cftf081(a(128), w(nw - 8))
-          call cftf082(a(144), w(nw - 8))
-          call cftf081(a(160), w(nw - 8))
-          call cftf081(a(176), w(nw - 8))
-          if (isplt .ne. 0) then
-              call cftmdl1(64, a(192), w(nw - 32))
-              call cftf081(a(240), w(nw - 8))
-          else
-              call cftmdl2(64, a(192), w(nw - 64))
-              call cftf082(a(240), w(nw - 8))
-          end if
-          call cftf081(a(192), w(nw - 8))
-          call cftf082(a(208), w(nw - 8))
-          call cftf081(a(224), w(nw - 8))
-      end if
-      end
-!
-      subroutine cftmdl1(n, a, w)
-      integer n, j, j0, j1, j2, j3, k, m, mh
-      real*8 a(0 : n - 1), w(0 : *)
-      real*8 wn4r, wk1r, wk1i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      mh = n / 8
-      m = 2 * mh
-      j1 = m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(0) + a(j2)
-      x0i = a(1) + a(j2 + 1)
-      x1r = a(0) - a(j2)
-      x1i = a(1) - a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(0) = x0r + x2r
-      a(1) = x0i + x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i - x2i
-      a(j2) = x1r - x3i
-      a(j2 + 1) = x1i + x3r
-      a(j3) = x1r + x3i
-      a(j3 + 1) = x1i - x3r
-      wn4r = w(1)
-      k = 0
-      do j = 2, mh - 2, 2
-          k = k + 4
-          wk1r = w(k)
-          wk1i = w(k + 1)
-          wk3r = w(k + 2)
-          wk3i = w(k + 3)
-          j1 = j + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j) + a(j2)
-          x0i = a(j + 1) + a(j2 + 1)
-          x1r = a(j) - a(j2)
-          x1i = a(j + 1) - a(j2 + 1)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          a(j) = x0r + x2r
-          a(j + 1) = x0i + x2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i - x2i
-          x0r = x1r - x3i
-          x0i = x1i + x3r
-          a(j2) = wk1r * x0r - wk1i * x0i
-          a(j2 + 1) = wk1r * x0i + wk1i * x0r
-          x0r = x1r + x3i
-          x0i = x1i - x3r
-          a(j3) = wk3r * x0r + wk3i * x0i
-          a(j3 + 1) = wk3r * x0i - wk3i * x0r
-          j0 = m - j
-          j1 = j0 + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j0) + a(j2)
-          x0i = a(j0 + 1) + a(j2 + 1)
-          x1r = a(j0) - a(j2)
-          x1i = a(j0 + 1) - a(j2 + 1)
-          x2r = a(j1) + a(j3)
-          x2i = a(j1 + 1) + a(j3 + 1)
-          x3r = a(j1) - a(j3)
-          x3i = a(j1 + 1) - a(j3 + 1)
-          a(j0) = x0r + x2r
-          a(j0 + 1) = x0i + x2i
-          a(j1) = x0r - x2r
-          a(j1 + 1) = x0i - x2i
-          x0r = x1r - x3i
-          x0i = x1i + x3r
-          a(j2) = wk1i * x0r - wk1r * x0i
-          a(j2 + 1) = wk1i * x0i + wk1r * x0r
-          x0r = x1r + x3i
-          x0i = x1i - x3r
-          a(j3) = wk3i * x0r + wk3r * x0i
-          a(j3 + 1) = wk3i * x0i - wk3r * x0r
-      end do
-      j0 = mh
-      j1 = j0 + m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(j0) + a(j2)
-      x0i = a(j0 + 1) + a(j2 + 1)
-      x1r = a(j0) - a(j2)
-      x1i = a(j0 + 1) - a(j2 + 1)
-      x2r = a(j1) + a(j3)
-      x2i = a(j1 + 1) + a(j3 + 1)
-      x3r = a(j1) - a(j3)
-      x3i = a(j1 + 1) - a(j3 + 1)
-      a(j0) = x0r + x2r
-      a(j0 + 1) = x0i + x2i
-      a(j1) = x0r - x2r
-      a(j1 + 1) = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      a(j2) = wn4r * (x0r - x0i)
-      a(j2 + 1) = wn4r * (x0i + x0r)
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      a(j3) = -wn4r * (x0r + x0i)
-      a(j3 + 1) = -wn4r * (x0i - x0r)
-      end
-!
-      subroutine cftmdl2(n, a, w)
-      integer n, j, j0, j1, j2, j3, k, kr, m, mh
-      real*8 a(0 : n - 1), w(0 : *)
-      real*8 wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      real*8 y0r, y0i, y2r, y2i
-      mh = n / 8
-      m = 2 * mh
-      wn4r = w(1)
-      j1 = m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(0) - a(j2 + 1)
-      x0i = a(1) + a(j2)
-      x1r = a(0) + a(j2 + 1)
-      x1i = a(1) - a(j2)
-      x2r = a(j1) - a(j3 + 1)
-      x2i = a(j1 + 1) + a(j3)
-      x3r = a(j1) + a(j3 + 1)
-      x3i = a(j1 + 1) - a(j3)
-      y0r = wn4r * (x2r - x2i)
-      y0i = wn4r * (x2i + x2r)
-      a(0) = x0r + y0r
-      a(1) = x0i + y0i
-      a(j1) = x0r - y0r
-      a(j1 + 1) = x0i - y0i
-      y0r = wn4r * (x3r - x3i)
-      y0i = wn4r * (x3i + x3r)
-      a(j2) = x1r - y0i
-      a(j2 + 1) = x1i + y0r
-      a(j3) = x1r + y0i
-      a(j3 + 1) = x1i - y0r
-      k = 0
-      kr = 2 * m
-      do j = 2, mh - 2, 2
-          k = k + 4
-          wk1r = w(k)
-          wk1i = w(k + 1)
-          wk3r = w(k + 2)
-          wk3i = w(k + 3)
-          kr = kr - 4
-          wd1i = w(kr)
-          wd1r = w(kr + 1)
-          wd3i = w(kr + 2)
-          wd3r = w(kr + 3)
-          j1 = j + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j) - a(j2 + 1)
-          x0i = a(j + 1) + a(j2)
-          x1r = a(j) + a(j2 + 1)
-          x1i = a(j + 1) - a(j2)
-          x2r = a(j1) - a(j3 + 1)
-          x2i = a(j1 + 1) + a(j3)
-          x3r = a(j1) + a(j3 + 1)
-          x3i = a(j1 + 1) - a(j3)
-          y0r = wk1r * x0r - wk1i * x0i
-          y0i = wk1r * x0i + wk1i * x0r
-          y2r = wd1r * x2r - wd1i * x2i
-          y2i = wd1r * x2i + wd1i * x2r
-          a(j) = y0r + y2r
-          a(j + 1) = y0i + y2i
-          a(j1) = y0r - y2r
-          a(j1 + 1) = y0i - y2i
-          y0r = wk3r * x1r + wk3i * x1i
-          y0i = wk3r * x1i - wk3i * x1r
-          y2r = wd3r * x3r + wd3i * x3i
-          y2i = wd3r * x3i - wd3i * x3r
-          a(j2) = y0r + y2r
-          a(j2 + 1) = y0i + y2i
-          a(j3) = y0r - y2r
-          a(j3 + 1) = y0i - y2i
-          j0 = m - j
-          j1 = j0 + m
-          j2 = j1 + m
-          j3 = j2 + m
-          x0r = a(j0) - a(j2 + 1)
-          x0i = a(j0 + 1) + a(j2)
-          x1r = a(j0) + a(j2 + 1)
-          x1i = a(j0 + 1) - a(j2)
-          x2r = a(j1) - a(j3 + 1)
-          x2i = a(j1 + 1) + a(j3)
-          x3r = a(j1) + a(j3 + 1)
-          x3i = a(j1 + 1) - a(j3)
-          y0r = wd1i * x0r - wd1r * x0i
-          y0i = wd1i * x0i + wd1r * x0r
-          y2r = wk1i * x2r - wk1r * x2i
-          y2i = wk1i * x2i + wk1r * x2r
-          a(j0) = y0r + y2r
-          a(j0 + 1) = y0i + y2i
-          a(j1) = y0r - y2r
-          a(j1 + 1) = y0i - y2i
-          y0r = wd3i * x1r + wd3r * x1i
-          y0i = wd3i * x1i - wd3r * x1r
-          y2r = wk3i * x3r + wk3r * x3i
-          y2i = wk3i * x3i - wk3r * x3r
-          a(j2) = y0r + y2r
-          a(j2 + 1) = y0i + y2i
-          a(j3) = y0r - y2r
-          a(j3 + 1) = y0i - y2i
-      end do
-      wk1r = w(m)
-      wk1i = w(m + 1)
-      j0 = mh
-      j1 = j0 + m
-      j2 = j1 + m
-      j3 = j2 + m
-      x0r = a(j0) - a(j2 + 1)
-      x0i = a(j0 + 1) + a(j2)
-      x1r = a(j0) + a(j2 + 1)
-      x1i = a(j0 + 1) - a(j2)
-      x2r = a(j1) - a(j3 + 1)
-      x2i = a(j1 + 1) + a(j3)
-      x3r = a(j1) + a(j3 + 1)
-      x3i = a(j1 + 1) - a(j3)
-      y0r = wk1r * x0r - wk1i * x0i
-      y0i = wk1r * x0i + wk1i * x0r
-      y2r = wk1i * x2r - wk1r * x2i
-      y2i = wk1i * x2i + wk1r * x2r
-      a(j0) = y0r + y2r
-      a(j0 + 1) = y0i + y2i
-      a(j1) = y0r - y2r
-      a(j1 + 1) = y0i - y2i
-      y0r = wk1i * x1r - wk1r * x1i
-      y0i = wk1i * x1i + wk1r * x1r
-      y2r = wk1r * x3r - wk1i * x3i
-      y2i = wk1r * x3i + wk1i * x3r
-      a(j2) = y0r - y2r
-      a(j2 + 1) = y0i - y2i
-      a(j3) = y0r + y2r
-      a(j3 + 1) = y0i + y2i
-      end
-!
-      subroutine cftfx41(n, a, nw, w)
-      integer n, nw
-      real*8 a(0 : n - 1), w(0 : nw - 1)
-      if (n .eq. 128) then
-          call cftf161(a, w(nw - 8))
-          call cftf162(a(32), w(nw - 32))
-          call cftf161(a(64), w(nw - 8))
-          call cftf161(a(96), w(nw - 8))
-      else
-          call cftf081(a, w(nw - 8))
-          call cftf082(a(16), w(nw - 8))
-          call cftf081(a(32), w(nw - 8))
-          call cftf081(a(48), w(nw - 8))
-      end if
-      end
-!
-      subroutine cftf161(a, w)
-      real*8 a(0 : 31), w(0 : *), wn4r, wk1r, wk1i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
-      real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
-      real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
-      wn4r = w(1)
-      wk1r = w(2)
-      wk1i = w(3)
-      x0r = a(0) + a(16)
-      x0i = a(1) + a(17)
-      x1r = a(0) - a(16)
-      x1i = a(1) - a(17)
-      x2r = a(8) + a(24)
-      x2i = a(9) + a(25)
-      x3r = a(8) - a(24)
-      x3i = a(9) - a(25)
-      y0r = x0r + x2r
-      y0i = x0i + x2i
-      y4r = x0r - x2r
-      y4i = x0i - x2i
-      y8r = x1r - x3i
-      y8i = x1i + x3r
-      y12r = x1r + x3i
-      y12i = x1i - x3r
-      x0r = a(2) + a(18)
-      x0i = a(3) + a(19)
-      x1r = a(2) - a(18)
-      x1i = a(3) - a(19)
-      x2r = a(10) + a(26)
-      x2i = a(11) + a(27)
-      x3r = a(10) - a(26)
-      x3i = a(11) - a(27)
-      y1r = x0r + x2r
-      y1i = x0i + x2i
-      y5r = x0r - x2r
-      y5i = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      y9r = wk1r * x0r - wk1i * x0i
-      y9i = wk1r * x0i + wk1i * x0r
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      y13r = wk1i * x0r - wk1r * x0i
-      y13i = wk1i * x0i + wk1r * x0r
-      x0r = a(4) + a(20)
-      x0i = a(5) + a(21)
-      x1r = a(4) - a(20)
-      x1i = a(5) - a(21)
-      x2r = a(12) + a(28)
-      x2i = a(13) + a(29)
-      x3r = a(12) - a(28)
-      x3i = a(13) - a(29)
-      y2r = x0r + x2r
-      y2i = x0i + x2i
-      y6r = x0r - x2r
-      y6i = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      y10r = wn4r * (x0r - x0i)
-      y10i = wn4r * (x0i + x0r)
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      y14r = wn4r * (x0r + x0i)
-      y14i = wn4r * (x0i - x0r)
-      x0r = a(6) + a(22)
-      x0i = a(7) + a(23)
-      x1r = a(6) - a(22)
-      x1i = a(7) - a(23)
-      x2r = a(14) + a(30)
-      x2i = a(15) + a(31)
-      x3r = a(14) - a(30)
-      x3i = a(15) - a(31)
-      y3r = x0r + x2r
-      y3i = x0i + x2i
-      y7r = x0r - x2r
-      y7i = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      y11r = wk1i * x0r - wk1r * x0i
-      y11i = wk1i * x0i + wk1r * x0r
-      x0r = x1r + x3i
-      x0i = x1i - x3r
-      y15r = wk1r * x0r - wk1i * x0i
-      y15i = wk1r * x0i + wk1i * x0r
-      x0r = y12r - y14r
-      x0i = y12i - y14i
-      x1r = y12r + y14r
-      x1i = y12i + y14i
-      x2r = y13r - y15r
-      x2i = y13i - y15i
-      x3r = y13r + y15r
-      x3i = y13i + y15i
-      a(24) = x0r + x2r
-      a(25) = x0i + x2i
-      a(26) = x0r - x2r
-      a(27) = x0i - x2i
-      a(28) = x1r - x3i
-      a(29) = x1i + x3r
-      a(30) = x1r + x3i
-      a(31) = x1i - x3r
-      x0r = y8r + y10r
-      x0i = y8i + y10i
-      x1r = y8r - y10r
-      x1i = y8i - y10i
-      x2r = y9r + y11r
-      x2i = y9i + y11i
-      x3r = y9r - y11r
-      x3i = y9i - y11i
-      a(16) = x0r + x2r
-      a(17) = x0i + x2i
-      a(18) = x0r - x2r
-      a(19) = x0i - x2i
-      a(20) = x1r - x3i
-      a(21) = x1i + x3r
-      a(22) = x1r + x3i
-      a(23) = x1i - x3r
-      x0r = y5r - y7i
-      x0i = y5i + y7r
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      x0r = y5r + y7i
-      x0i = y5i - y7r
-      x3r = wn4r * (x0r - x0i)
-      x3i = wn4r * (x0i + x0r)
-      x0r = y4r - y6i
-      x0i = y4i + y6r
-      x1r = y4r + y6i
-      x1i = y4i - y6r
-      a(8) = x0r + x2r
-      a(9) = x0i + x2i
-      a(10) = x0r - x2r
-      a(11) = x0i - x2i
-      a(12) = x1r - x3i
-      a(13) = x1i + x3r
-      a(14) = x1r + x3i
-      a(15) = x1i - x3r
-      x0r = y0r + y2r
-      x0i = y0i + y2i
-      x1r = y0r - y2r
-      x1i = y0i - y2i
-      x2r = y1r + y3r
-      x2i = y1i + y3i
-      x3r = y1r - y3r
-      x3i = y1i - y3i
-      a(0) = x0r + x2r
-      a(1) = x0i + x2i
-      a(2) = x0r - x2r
-      a(3) = x0i - x2i
-      a(4) = x1r - x3i
-      a(5) = x1i + x3r
-      a(6) = x1r + x3i
-      a(7) = x1i - x3r
-      end
-!
-      subroutine cftf162(a, w)
-      real*8 a(0 : 31), w(0 : *)
-      real*8 wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
-      real*8 x0r, x0i, x1r, x1i, x2r, x2i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
-      real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
-      real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
-      wn4r = w(1)
-      wk1r = w(4)
-      wk1i = w(5)
-      wk3r = w(6)
-      wk3i = -w(7)
-      wk2r = w(8)
-      wk2i = w(9)
-      x1r = a(0) - a(17)
-      x1i = a(1) + a(16)
-      x0r = a(8) - a(25)
-      x0i = a(9) + a(24)
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      y0r = x1r + x2r
-      y0i = x1i + x2i
-      y4r = x1r - x2r
-      y4i = x1i - x2i
-      x1r = a(0) + a(17)
-      x1i = a(1) - a(16)
-      x0r = a(8) + a(25)
-      x0i = a(9) - a(24)
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      y8r = x1r - x2i
-      y8i = x1i + x2r
-      y12r = x1r + x2i
-      y12i = x1i - x2r
-      x0r = a(2) - a(19)
-      x0i = a(3) + a(18)
-      x1r = wk1r * x0r - wk1i * x0i
-      x1i = wk1r * x0i + wk1i * x0r
-      x0r = a(10) - a(27)
-      x0i = a(11) + a(26)
-      x2r = wk3i * x0r - wk3r * x0i
-      x2i = wk3i * x0i + wk3r * x0r
-      y1r = x1r + x2r
-      y1i = x1i + x2i
-      y5r = x1r - x2r
-      y5i = x1i - x2i
-      x0r = a(2) + a(19)
-      x0i = a(3) - a(18)
-      x1r = wk3r * x0r - wk3i * x0i
-      x1i = wk3r * x0i + wk3i * x0r
-      x0r = a(10) + a(27)
-      x0i = a(11) - a(26)
-      x2r = wk1r * x0r + wk1i * x0i
-      x2i = wk1r * x0i - wk1i * x0r
-      y9r = x1r - x2r
-      y9i = x1i - x2i
-      y13r = x1r + x2r
-      y13i = x1i + x2i
-      x0r = a(4) - a(21)
-      x0i = a(5) + a(20)
-      x1r = wk2r * x0r - wk2i * x0i
-      x1i = wk2r * x0i + wk2i * x0r
-      x0r = a(12) - a(29)
-      x0i = a(13) + a(28)
-      x2r = wk2i * x0r - wk2r * x0i
-      x2i = wk2i * x0i + wk2r * x0r
-      y2r = x1r + x2r
-      y2i = x1i + x2i
-      y6r = x1r - x2r
-      y6i = x1i - x2i
-      x0r = a(4) + a(21)
-      x0i = a(5) - a(20)
-      x1r = wk2i * x0r - wk2r * x0i
-      x1i = wk2i * x0i + wk2r * x0r
-      x0r = a(12) + a(29)
-      x0i = a(13) - a(28)
-      x2r = wk2r * x0r - wk2i * x0i
-      x2i = wk2r * x0i + wk2i * x0r
-      y10r = x1r - x2r
-      y10i = x1i - x2i
-      y14r = x1r + x2r
-      y14i = x1i + x2i
-      x0r = a(6) - a(23)
-      x0i = a(7) + a(22)
-      x1r = wk3r * x0r - wk3i * x0i
-      x1i = wk3r * x0i + wk3i * x0r
-      x0r = a(14) - a(31)
-      x0i = a(15) + a(30)
-      x2r = wk1i * x0r - wk1r * x0i
-      x2i = wk1i * x0i + wk1r * x0r
-      y3r = x1r + x2r
-      y3i = x1i + x2i
-      y7r = x1r - x2r
-      y7i = x1i - x2i
-      x0r = a(6) + a(23)
-      x0i = a(7) - a(22)
-      x1r = wk1i * x0r + wk1r * x0i
-      x1i = wk1i * x0i - wk1r * x0r
-      x0r = a(14) + a(31)
-      x0i = a(15) - a(30)
-      x2r = wk3i * x0r - wk3r * x0i
-      x2i = wk3i * x0i + wk3r * x0r
-      y11r = x1r + x2r
-      y11i = x1i + x2i
-      y15r = x1r - x2r
-      y15i = x1i - x2i
-      x1r = y0r + y2r
-      x1i = y0i + y2i
-      x2r = y1r + y3r
-      x2i = y1i + y3i
-      a(0) = x1r + x2r
-      a(1) = x1i + x2i
-      a(2) = x1r - x2r
-      a(3) = x1i - x2i
-      x1r = y0r - y2r
-      x1i = y0i - y2i
-      x2r = y1r - y3r
-      x2i = y1i - y3i
-      a(4) = x1r - x2i
-      a(5) = x1i + x2r
-      a(6) = x1r + x2i
-      a(7) = x1i - x2r
-      x1r = y4r - y6i
-      x1i = y4i + y6r
-      x0r = y5r - y7i
-      x0i = y5i + y7r
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      a(8) = x1r + x2r
-      a(9) = x1i + x2i
-      a(10) = x1r - x2r
-      a(11) = x1i - x2i
-      x1r = y4r + y6i
-      x1i = y4i - y6r
-      x0r = y5r + y7i
-      x0i = y5i - y7r
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      a(12) = x1r - x2i
-      a(13) = x1i + x2r
-      a(14) = x1r + x2i
-      a(15) = x1i - x2r
-      x1r = y8r + y10r
-      x1i = y8i + y10i
-      x2r = y9r - y11r
-      x2i = y9i - y11i
-      a(16) = x1r + x2r
-      a(17) = x1i + x2i
-      a(18) = x1r - x2r
-      a(19) = x1i - x2i
-      x1r = y8r - y10r
-      x1i = y8i - y10i
-      x2r = y9r + y11r
-      x2i = y9i + y11i
-      a(20) = x1r - x2i
-      a(21) = x1i + x2r
-      a(22) = x1r + x2i
-      a(23) = x1i - x2r
-      x1r = y12r - y14i
-      x1i = y12i + y14r
-      x0r = y13r + y15i
-      x0i = y13i - y15r
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      a(24) = x1r + x2r
-      a(25) = x1i + x2i
-      a(26) = x1r - x2r
-      a(27) = x1i - x2i
-      x1r = y12r + y14i
-      x1i = y12i - y14r
-      x0r = y13r - y15i
-      x0i = y13i + y15r
-      x2r = wn4r * (x0r - x0i)
-      x2i = wn4r * (x0i + x0r)
-      a(28) = x1r - x2i
-      a(29) = x1i + x2r
-      a(30) = x1r + x2i
-      a(31) = x1i - x2r
-      end
-!
-      subroutine cftf081(a, w)
-      real*8 a(0 : 15), w(0 : *)
-      real*8 wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
-      wn4r = w(1)
-      x0r = a(0) + a(8)
-      x0i = a(1) + a(9)
-      x1r = a(0) - a(8)
-      x1i = a(1) - a(9)
-      x2r = a(4) + a(12)
-      x2i = a(5) + a(13)
-      x3r = a(4) - a(12)
-      x3i = a(5) - a(13)
-      y0r = x0r + x2r
-      y0i = x0i + x2i
-      y2r = x0r - x2r
-      y2i = x0i - x2i
-      y1r = x1r - x3i
-      y1i = x1i + x3r
-      y3r = x1r + x3i
-      y3i = x1i - x3r
-      x0r = a(2) + a(10)
-      x0i = a(3) + a(11)
-      x1r = a(2) - a(10)
-      x1i = a(3) - a(11)
-      x2r = a(6) + a(14)
-      x2i = a(7) + a(15)
-      x3r = a(6) - a(14)
-      x3i = a(7) - a(15)
-      y4r = x0r + x2r
-      y4i = x0i + x2i
-      y6r = x0r - x2r
-      y6i = x0i - x2i
-      x0r = x1r - x3i
-      x0i = x1i + x3r
-      x2r = x1r + x3i
-      x2i = x1i - x3r
-      y5r = wn4r * (x0r - x0i)
-      y5i = wn4r * (x0r + x0i)
-      y7r = wn4r * (x2r - x2i)
-      y7i = wn4r * (x2r + x2i)
-      a(8) = y1r + y5r
-      a(9) = y1i + y5i
-      a(10) = y1r - y5r
-      a(11) = y1i - y5i
-      a(12) = y3r - y7i
-      a(13) = y3i + y7r
-      a(14) = y3r + y7i
-      a(15) = y3i - y7r
-      a(0) = y0r + y4r
-      a(1) = y0i + y4i
-      a(2) = y0r - y4r
-      a(3) = y0i - y4i
-      a(4) = y2r - y6i
-      a(5) = y2i + y6r
-      a(6) = y2r + y6i
-      a(7) = y2i - y6r
-      end
-!
-      subroutine cftf082(a, w)
-      real*8 a(0 : 15), w(0 : *)
-      real*8 wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i
-      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
-      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
-      wn4r = w(1)
-      wk1r = w(2)
-      wk1i = w(3)
-      y0r = a(0) - a(9)
-      y0i = a(1) + a(8)
-      y1r = a(0) + a(9)
-      y1i = a(1) - a(8)
-      x0r = a(4) - a(13)
-      x0i = a(5) + a(12)
-      y2r = wn4r * (x0r - x0i)
-      y2i = wn4r * (x0i + x0r)
-      x0r = a(4) + a(13)
-      x0i = a(5) - a(12)
-      y3r = wn4r * (x0r - x0i)
-      y3i = wn4r * (x0i + x0r)
-      x0r = a(2) - a(11)
-      x0i = a(3) + a(10)
-      y4r = wk1r * x0r - wk1i * x0i
-      y4i = wk1r * x0i + wk1i * x0r
-      x0r = a(2) + a(11)
-      x0i = a(3) - a(10)
-      y5r = wk1i * x0r - wk1r * x0i
-      y5i = wk1i * x0i + wk1r * x0r
-      x0r = a(6) - a(15)
-      x0i = a(7) + a(14)
-      y6r = wk1i * x0r - wk1r * x0i
-      y6i = wk1i * x0i + wk1r * x0r
-      x0r = a(6) + a(15)
-      x0i = a(7) - a(14)
-      y7r = wk1r * x0r - wk1i * x0i
-      y7i = wk1r * x0i + wk1i * x0r
-      x0r = y0r + y2r
-      x0i = y0i + y2i
-      x1r = y4r + y6r
-      x1i = y4i + y6i
-      a(0) = x0r + x1r
-      a(1) = x0i + x1i
-      a(2) = x0r - x1r
-      a(3) = x0i - x1i
-      x0r = y0r - y2r
-      x0i = y0i - y2i
-      x1r = y4r - y6r
-      x1i = y4i - y6i
-      a(4) = x0r - x1i
-      a(5) = x0i + x1r
-      a(6) = x0r + x1i
-      a(7) = x0i - x1r
-      x0r = y1r - y3i
-      x0i = y1i + y3r
-      x1r = y5r - y7r
-      x1i = y5i - y7i
-      a(8) = x0r + x1r
-      a(9) = x0i + x1i
-      a(10) = x0r - x1r
-      a(11) = x0i - x1i
-      x0r = y1r + y3i
-      x0i = y1i - y3r
-      x1r = y5r + y7r
-      x1i = y5i + y7i
-      a(12) = x0r - x1i
-      a(13) = x0i + x1r
-      a(14) = x0r + x1i
-      a(15) = x0i - x1r
-      end
-!
-      subroutine cftf040(a)
-      real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      x0r = a(0) + a(4)
-      x0i = a(1) + a(5)
-      x1r = a(0) - a(4)
-      x1i = a(1) - a(5)
-      x2r = a(2) + a(6)
-      x2i = a(3) + a(7)
-      x3r = a(2) - a(6)
-      x3i = a(3) - a(7)
-      a(0) = x0r + x2r
-      a(1) = x0i + x2i
-      a(2) = x1r - x3i
-      a(3) = x1i + x3r
-      a(4) = x0r - x2r
-      a(5) = x0i - x2i
-      a(6) = x1r + x3i
-      a(7) = x1i - x3r
-      end
-!
-      subroutine cftb040(a)
-      real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
-      x0r = a(0) + a(4)
-      x0i = a(1) + a(5)
-      x1r = a(0) - a(4)
-      x1i = a(1) - a(5)
-      x2r = a(2) + a(6)
-      x2i = a(3) + a(7)
-      x3r = a(2) - a(6)
-      x3i = a(3) - a(7)
-      a(0) = x0r + x2r
-      a(1) = x0i + x2i
-      a(2) = x1r + x3i
-      a(3) = x1i - x3r
-      a(4) = x0r - x2r
-      a(5) = x0i - x2i
-      a(6) = x1r - x3i
-      a(7) = x1i + x3r
-      end
-!
-      subroutine cftx020(a)
-      real*8 a(0 : 3), x0r, x0i
-      x0r = a(0) - a(2)
-      x0i = a(1) - a(3)
-      a(0) = a(0) + a(2)
-      a(1) = a(1) + a(3)
-      a(2) = x0r
-      a(3) = x0i
-      end
-!
-      subroutine rftfsub(n, a, nc, c)
-      integer n, nc, j, k, kk, ks, m
-      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
-      m = n / 2
-      ks = 2 * nc / m
-      kk = 0
-      do j = 2, m - 2, 2
-          k = n - j
-          kk = kk + ks
-          wkr = 0.5d0 - c(nc - kk)
-          wki = c(kk)
-          xr = a(j) - a(k)
-          xi = a(j + 1) + a(k + 1)
-          yr = wkr * xr - wki * xi
-          yi = wkr * xi + wki * xr
-          a(j) = a(j) - yr
-          a(j + 1) = a(j + 1) - yi
-          a(k) = a(k) + yr
-          a(k + 1) = a(k + 1) - yi
-      end do
-      end
-!
-      subroutine rftbsub(n, a, nc, c)
-      integer n, nc, j, k, kk, ks, m
-      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
-      m = n / 2
-      ks = 2 * nc / m
-      kk = 0
-      do j = 2, m - 2, 2
-          k = n - j
-          kk = kk + ks
-          wkr = 0.5d0 - c(nc - kk)
-          wki = c(kk)
-          xr = a(j) - a(k)
-          xi = a(j + 1) + a(k + 1)
-          yr = wkr * xr + wki * xi
-          yi = wkr * xi - wki * xr
-          a(j) = a(j) - yr
-          a(j + 1) = a(j + 1) - yi
-          a(k) = a(k) + yr
-          a(k + 1) = a(k + 1) - yi
-      end do
-      end
-!
-      subroutine dctsub(n, a, nc, c)
-      integer n, nc, j, k, kk, ks, m
-      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
-      m = n / 2
-      ks = nc / n
-      kk = 0
-      do j = 1, m - 1
-          k = n - j
-          kk = kk + ks
-          wkr = c(kk) - c(nc - kk)
-          wki = c(kk) + c(nc - kk)
-          xr = wki * a(j) - wkr * a(k)
-          a(j) = wkr * a(j) + wki * a(k)
-          a(k) = xr
-      end do
-      a(m) = c(0) * a(m)
-      end
-!
-      subroutine dstsub(n, a, nc, c)
-      integer n, nc, j, k, kk, ks, m
-      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
-      m = n / 2
-      ks = nc / n
-      kk = 0
-      do j = 1, m - 1
-          k = n - j
-          kk = kk + ks
-          wkr = c(kk) - c(nc - kk)
-          wki = c(kk) + c(nc - kk)
-          xr = wki * a(k) - wkr * a(j)
-          a(k) = wkr * a(k) + wki * a(j)
-          a(j) = xr
-      end do
-      a(m) = c(0) * a(m)
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg2d.c b/third_party/tensorflow_dependencies/fft2d/fftsg2d.c
deleted file mode 100644
index 2ebfa93..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg2d.c
+++ /dev/null
@@ -1,1190 +0,0 @@
-/*
-Fast Fourier/Cosine/Sine Transform
-    dimension   :two
-    data length :power of 2
-    decimation  :frequency
-    radix       :split-radix, row-column
-    data        :inplace
-    table       :use
-functions
-    cdft2d: Complex Discrete Fourier Transform
-    rdft2d: Real Discrete Fourier Transform
-    ddct2d: Discrete Cosine Transform
-    ddst2d: Discrete Sine Transform
-function prototypes
-    void cdft2d(int, int, int, double **, double *, int *, double *);
-    void rdft2d(int, int, int, double **, double *, int *, double *);
-    void rdft2dsort(int, int, int, double **);
-    void ddct2d(int, int, int, double **, double *, int *, double *);
-    void ddst2d(int, int, int, double **, double *, int *, double *);
-necessary package
-    fftsg.c  : 1D-FFT package
-macro definitions
-    USE_FFT2D_PTHREADS : default=not defined
-        FFT2D_MAX_THREADS     : must be 2^N, default=4
-        FFT2D_THREADS_BEGIN_N : default=65536
-    USE_FFT2D_WINTHREADS : default=not defined
-        FFT2D_MAX_THREADS     : must be 2^N, default=4
-        FFT2D_THREADS_BEGIN_N : default=131072
-
-
--------- Complex DFT (Discrete Fourier Transform) --------
-    [definition]
-        <case1>
-            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
-                            exp(2*pi*i*j1*k1/n1) * 
-                            exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
-        <case2>
-            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
-                            exp(-2*pi*i*j1*k1/n1) * 
-                            exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
-        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            cdft2d(n1, 2*n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            cdft2d(n1, 2*n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 1, n1 = power of 2
-        2*n2   :data length (int)
-                n2 >= 1, n2 = power of 2
-        a[0...n1-1][0...2*n2-1]
-               :input/output data (double **)
-                input data
-                    a[j1][2*j2] = Re(x[j1][j2]), 
-                    a[j1][2*j2+1] = Im(x[j1][j2]), 
-                    0<=j1<n1, 0<=j2<n2
-                output data
-                    a[k1][2*k2] = Re(X[k1][k2]), 
-                    a[k1][2*k2+1] = Im(X[k1][k2]), 
-                    0<=k1<n1, 0<=k2<n2
-        t[0...*]
-               :work area (double *)
-                length of t >= 8*n1,                   if single thread, 
-                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            cdft2d(n1, 2*n2, -1, a, t, ip, w);
-        is 
-            cdft2d(n1, 2*n2, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
-                    a[j1][j2] *= 1.0 / n1 / n2;
-                }
-            }
-        .
-
-
--------- Real DFT / Inverse of Real DFT --------
-    [definition]
-        <case1> RDFT
-            R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-            I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> IRDFT (excluding scale)
-            a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
-                            (R[j1][j2] * 
-                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
-                            I[j1][j2] * 
-                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
-                            0<=k1<n1, 0<=k2<n2
-        (notes: R[n1-k1][n2-k2] = R[k1][k2], 
-                I[n1-k1][n2-k2] = -I[k1][k2], 
-                R[n1-k1][0] = R[k1][0], 
-                I[n1-k1][0] = -I[k1][0], 
-                R[0][n2-k2] = R[0][k2], 
-                I[0][n2-k2] = -I[0][k2], 
-                0<k1<n1, 0<k2<n2)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            rdft2d(n1, n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            rdft2d(n1, n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                <case1>
-                    output data
-                        a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], 
-                        a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], 
-                            0<k1<n1, 0<k2<n2/2, 
-                        a[0][2*k2] = R[0][k2] = R[0][n2-k2], 
-                        a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2], 
-                            0<k2<n2/2, 
-                        a[k1][0] = R[k1][0] = R[n1-k1][0], 
-                        a[k1][1] = I[k1][0] = -I[n1-k1][0], 
-                        a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2], 
-                        a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2], 
-                            0<k1<n1/2, 
-                        a[0][0] = R[0][0], 
-                        a[0][1] = R[0][n2/2], 
-                        a[n1/2][0] = R[n1/2][0], 
-                        a[n1/2][1] = R[n1/2][n2/2]
-                <case2>
-                    input data
-                        a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], 
-                        a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], 
-                            0<j1<n1, 0<j2<n2/2, 
-                        a[0][2*j2] = R[0][j2] = R[0][n2-j2], 
-                        a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2], 
-                            0<j2<n2/2, 
-                        a[j1][0] = R[j1][0] = R[n1-j1][0], 
-                        a[j1][1] = I[j1][0] = -I[n1-j1][0], 
-                        a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2], 
-                        a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2], 
-                            0<j1<n1/2, 
-                        a[0][0] = R[0][0], 
-                        a[0][1] = R[0][n2/2], 
-                        a[n1/2][0] = R[n1/2][0], 
-                        a[n1/2][1] = R[n1/2][n2/2]
-                ---- output ordering ----
-                    rdft2d(n1, n2, 1, a, t, ip, w);
-                    rdft2dsort(n1, n2, 1, a);
-                    // stored data is a[0...n1-1][0...n2+1]:
-                    // a[k1][2*k2] = R[k1][k2], 
-                    // a[k1][2*k2+1] = I[k1][k2], 
-                    // 0<=k1<n1, 0<=k2<=n2/2.
-                    // the stored data is larger than the input data!
-                ---- input ordering ----
-                    rdft2dsort(n1, n2, -1, a);
-                    rdft2d(n1, n2, -1, a, t, ip, w);
-        t[0...*]
-               :work area (double *)
-                length of t >= 8*n1,                   if single thread, 
-                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/4) + n2/4
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            rdft2d(n1, n2, 1, a, t, ip, w);
-        is 
-            rdft2d(n1, n2, -1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 2.0 / n1 / n2;
-                }
-            }
-        .
-
-
--------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> IDCT (excluding scale)
-            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(pi*j1*(k1+1/2)/n1) * 
-                            cos(pi*j2*(k2+1/2)/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> DCT
-            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            cos(pi*(j1+1/2)*k1/n1) * 
-                            cos(pi*(j2+1/2)*k2/n2), 
-                            0<=k1<n1, 0<=k2<n2
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddct2d(n1, n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddct2d(n1, n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                output data
-                    a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
-        t[0...*]
-               :work area (double *)
-                length of t >= 4*n1,                   if single thread, 
-                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1/2, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1*3/2, n2*3/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddct2d(n1, n2, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                a[j1][0] *= 0.5;
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                a[0][j2] *= 0.5;
-            }
-            ddct2d(n1, n2, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 4.0 / n1 / n2;
-                }
-            }
-        .
-
-
--------- DST (Discrete Sine Transform) / Inverse of DST --------
-    [definition]
-        <case1> IDST (excluding scale)
-            S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * 
-                            sin(pi*j1*(k1+1/2)/n1) * 
-                            sin(pi*j2*(k2+1/2)/n2), 
-                            0<=k1<n1, 0<=k2<n2
-        <case2> DST
-            S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
-                            sin(pi*(j1+1/2)*k1/n1) * 
-                            sin(pi*(j2+1/2)*k2/n2), 
-                            0<k1<=n1, 0<k2<=n2
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddst2d(n1, n2, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddst2d(n1, n2, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        a[0...n1-1][0...n2-1]
-               :input/output data (double **)
-                <case1>
-                    input data
-                        a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2, 
-                        a[j1][0] = A[j1][n2], 0<j1<n1, 
-                        a[0][j2] = A[n1][j2], 0<j2<n2, 
-                        a[0][0] = A[n1][n2]
-                        (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
-                    output data
-                        a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
-                <case2>
-                    output data
-                        a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2, 
-                        a[k1][0] = S[k1][n2], 0<k1<n1, 
-                        a[0][k2] = S[n1][k2], 0<k2<n2, 
-                        a[0][0] = S[n1][n2]
-                        (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
-        t[0...*]
-               :work area (double *)
-                length of t >= 4*n1,                   if single thread, 
-                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1/2, n2/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1*3/2, n2*3/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddst2d(n1, n2, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                a[j1][0] *= 0.5;
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                a[0][j2] *= 0.5;
-            }
-            ddst2d(n1, n2, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2] *= 4.0 / n1 / n2;
-                }
-            }
-        .
-*/
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#define fft2d_alloc_error_check(p) { \
-    if ((p) == NULL) { \
-        fprintf(stderr, "fft2d memory allocation error\n"); \
-        exit(1); \
-    } \
-}
-
-
-#ifdef USE_FFT2D_PTHREADS
-#define USE_FFT2D_THREADS
-#ifndef FFT2D_MAX_THREADS
-#define FFT2D_MAX_THREADS 4
-#endif
-#ifndef FFT2D_THREADS_BEGIN_N
-#define FFT2D_THREADS_BEGIN_N 65536
-#endif
-#include <pthread.h>
-#define fft2d_thread_t pthread_t
-#define fft2d_thread_create(thp,func,argp) { \
-    if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
-        fprintf(stderr, "fft2d thread error\n"); \
-        exit(1); \
-    } \
-}
-#define fft2d_thread_wait(th) { \
-    if (pthread_join(th, NULL) != 0) { \
-        fprintf(stderr, "fft2d thread error\n"); \
-        exit(1); \
-    } \
-}
-#endif /* USE_FFT2D_PTHREADS */
-
-
-#ifdef USE_FFT2D_WINTHREADS
-#define USE_FFT2D_THREADS
-#ifndef FFT2D_MAX_THREADS
-#define FFT2D_MAX_THREADS 4
-#endif
-#ifndef FFT2D_THREADS_BEGIN_N
-#define FFT2D_THREADS_BEGIN_N 131072
-#endif
-#include <windows.h>
-#define fft2d_thread_t HANDLE
-#define fft2d_thread_create(thp,func,argp) { \
-    DWORD thid; \
-    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
-    if (*(thp) == 0) { \
-        fprintf(stderr, "fft2d thread error\n"); \
-        exit(1); \
-    } \
-}
-#define fft2d_thread_wait(th) { \
-    WaitForSingleObject(th, INFINITE); \
-    CloseHandle(th); \
-}
-#endif /* USE_FFT2D_WINTHREADS */
-
-
-void cdft2d(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
-        int *ip, double *w);
-#ifdef USE_FFT2D_THREADS
-    void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
-        int *ip, double *w);
-    void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
-        int *ip, double *w);
-#endif /* USE_FFT2D_THREADS */
-    int n, itnull, nthread, nt, i;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    if (n > (ip[0] << 2)) {
-        makewt(n >> 2, ip, w);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nthread = 1;
-#ifdef USE_FFT2D_THREADS
-        nthread = FFT2D_MAX_THREADS;
-#endif /* USE_FFT2D_THREADS */
-        nt = 8 * nthread * n1;
-        if (n2 == 4 * nthread) {
-            nt >>= 1;
-        } else if (n2 < 4 * nthread) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft2d_alloc_error_check(t);
-    }
-#ifdef USE_FFT2D_THREADS
-    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
-        xdft2d0_subth(n1, n2, 0, isgn, a, ip, w);
-        cdft2d_subth(n1, n2, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT2D_THREADS */
-    {
-        for (i = 0; i < n1; i++) {
-            cdft(n2, isgn, a[i], ip, w);
-        }
-        cdft2d_sub(n1, n2, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void rdft2d(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void rdft(int n, int isgn, double *a, int *ip, double *w);
-    void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
-        int *ip, double *w);
-    void rdft2d_sub(int n1, int n2, int isgn, double **a);
-#ifdef USE_FFT2D_THREADS
-    void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
-        int *ip, double *w);
-    void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
-        int *ip, double *w);
-#endif /* USE_FFT2D_THREADS */
-    int n, nw, nc, itnull, nthread, nt, i;
-    
-    n = n1 << 1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n2 > (nc << 2)) {
-        nc = n2 >> 2;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nthread = 1;
-#ifdef USE_FFT2D_THREADS
-        nthread = FFT2D_MAX_THREADS;
-#endif /* USE_FFT2D_THREADS */
-        nt = 8 * nthread * n1;
-        if (n2 == 4 * nthread) {
-            nt >>= 1;
-        } else if (n2 < 4 * nthread) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft2d_alloc_error_check(t);
-    }
-#ifdef USE_FFT2D_THREADS
-    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
-        if (isgn < 0) {
-            rdft2d_sub(n1, n2, isgn, a);
-            cdft2d_subth(n1, n2, isgn, a, t, ip, w);
-        }
-        xdft2d0_subth(n1, n2, 1, isgn, a, ip, w);
-        if (isgn >= 0) {
-            cdft2d_subth(n1, n2, isgn, a, t, ip, w);
-            rdft2d_sub(n1, n2, isgn, a);
-        }
-    } else 
-#endif /* USE_FFT2D_THREADS */
-    {
-        if (isgn < 0) {
-            rdft2d_sub(n1, n2, isgn, a);
-            cdft2d_sub(n1, n2, isgn, a, t, ip, w);
-        }
-        for (i = 0; i < n1; i++) {
-            rdft(n2, isgn, a[i], ip, w);
-        }
-        if (isgn >= 0) {
-            cdft2d_sub(n1, n2, isgn, a, t, ip, w);
-            rdft2d_sub(n1, n2, isgn, a);
-        }
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void rdft2dsort(int n1, int n2, int isgn, double **a)
-{
-    int n1h, i;
-    double x, y;
-    
-    n1h = n1 >> 1;
-    if (isgn < 0) {
-        for (i = n1h + 1; i < n1; i++) {
-            a[i][0] = a[i][n2 + 1];
-            a[i][1] = a[i][n2];
-        }
-        a[0][1] = a[0][n2];
-        a[n1h][1] = a[n1h][n2];
-    } else {
-        for (i = n1h + 1; i < n1; i++) {
-            y = a[i][0];
-            x = a[i][1];
-            a[i][n2] = x;
-            a[i][n2 + 1] = y;
-            a[n1 - i][n2] = x;
-            a[n1 - i][n2 + 1] = -y;
-            a[i][0] = a[n1 - i][0];
-            a[i][1] = -a[n1 - i][1];
-        }
-        a[0][n2] = a[0][1];
-        a[0][n2 + 1] = 0;
-        a[0][1] = 0;
-        a[n1h][n2] = a[n1h][1];
-        a[n1h][n2 + 1] = 0;
-        a[n1h][1] = 0;
-    }
-}
-
-
-void ddct2d(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
-        double *t, int *ip, double *w);
-#ifdef USE_FFT2D_THREADS
-    void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
-        int *ip, double *w);
-    void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
-        double *t, int *ip, double *w);
-#endif /* USE_FFT2D_THREADS */
-    int n, nw, nc, itnull, nthread, nt, i;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nthread = 1;
-#ifdef USE_FFT2D_THREADS
-        nthread = FFT2D_MAX_THREADS;
-#endif /* USE_FFT2D_THREADS */
-        nt = 4 * nthread * n1;
-        if (n2 == 2 * nthread) {
-            nt >>= 1;
-        } else if (n2 < 2 * nthread) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft2d_alloc_error_check(t);
-    }
-#ifdef USE_FFT2D_THREADS
-    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
-        ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
-        ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT2D_THREADS */
-    {
-        for (i = 0; i < n1; i++) {
-            ddct(n2, isgn, a[i], ip, w);
-        }
-        ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void ddst2d(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
-        double *t, int *ip, double *w);
-#ifdef USE_FFT2D_THREADS
-    void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
-        int *ip, double *w);
-    void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
-        double *t, int *ip, double *w);
-#endif /* USE_FFT2D_THREADS */
-    int n, nw, nc, itnull, nthread, nt, i;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nthread = 1;
-#ifdef USE_FFT2D_THREADS
-        nthread = FFT2D_MAX_THREADS;
-#endif /* USE_FFT2D_THREADS */
-        nt = 4 * nthread * n1;
-        if (n2 == 2 * nthread) {
-            nt >>= 1;
-        } else if (n2 < 2 * nthread) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft2d_alloc_error_check(t);
-    }
-#ifdef USE_FFT2D_THREADS
-    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
-        ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
-        ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT2D_THREADS */
-    {
-        for (i = 0; i < n1; i++) {
-            ddst(n2, isgn, a[i], ip, w);
-        }
-        ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-/* -------- child routines -------- */
-
-
-void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    int i, j;
-    
-    if (n2 > 4) {
-        for (j = 0; j < n2; j += 8) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j];
-                t[2 * i + 1] = a[i][j + 1];
-                t[2 * n1 + 2 * i] = a[i][j + 2];
-                t[2 * n1 + 2 * i + 1] = a[i][j + 3];
-                t[4 * n1 + 2 * i] = a[i][j + 4];
-                t[4 * n1 + 2 * i + 1] = a[i][j + 5];
-                t[6 * n1 + 2 * i] = a[i][j + 6];
-                t[6 * n1 + 2 * i + 1] = a[i][j + 7];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-            cdft(2 * n1, isgn, &t[4 * n1], ip, w);
-            cdft(2 * n1, isgn, &t[6 * n1], ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j] = t[2 * i];
-                a[i][j + 1] = t[2 * i + 1];
-                a[i][j + 2] = t[2 * n1 + 2 * i];
-                a[i][j + 3] = t[2 * n1 + 2 * i + 1];
-                a[i][j + 4] = t[4 * n1 + 2 * i];
-                a[i][j + 5] = t[4 * n1 + 2 * i + 1];
-                a[i][j + 6] = t[6 * n1 + 2 * i];
-                a[i][j + 7] = t[6 * n1 + 2 * i + 1];
-            }
-        }
-    } else if (n2 == 4) {
-        for (i = 0; i < n1; i++) {
-            t[2 * i] = a[i][0];
-            t[2 * i + 1] = a[i][1];
-            t[2 * n1 + 2 * i] = a[i][2];
-            t[2 * n1 + 2 * i + 1] = a[i][3];
-        }
-        cdft(2 * n1, isgn, t, ip, w);
-        cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-        for (i = 0; i < n1; i++) {
-            a[i][0] = t[2 * i];
-            a[i][1] = t[2 * i + 1];
-            a[i][2] = t[2 * n1 + 2 * i];
-            a[i][3] = t[2 * n1 + 2 * i + 1];
-        }
-    } else if (n2 == 2) {
-        for (i = 0; i < n1; i++) {
-            t[2 * i] = a[i][0];
-            t[2 * i + 1] = a[i][1];
-        }
-        cdft(2 * n1, isgn, t, ip, w);
-        for (i = 0; i < n1; i++) {
-            a[i][0] = t[2 * i];
-            a[i][1] = t[2 * i + 1];
-        }
-    }
-}
-
-
-void rdft2d_sub(int n1, int n2, int isgn, double **a)
-{
-    int n1h, i, j;
-    double xi;
-    
-    n1h = n1 >> 1;
-    if (isgn < 0) {
-        for (i = 1; i < n1h; i++) {
-            j = n1 - i;
-            xi = a[i][0] - a[j][0];
-            a[i][0] += a[j][0];
-            a[j][0] = xi;
-            xi = a[j][1] - a[i][1];
-            a[i][1] += a[j][1];
-            a[j][1] = xi;
-        }
-    } else {
-        for (i = 1; i < n1h; i++) {
-            j = n1 - i;
-            a[j][0] = 0.5 * (a[i][0] - a[j][0]);
-            a[i][0] -= a[j][0];
-            a[j][1] = 0.5 * (a[i][1] + a[j][1]);
-            a[i][1] -= a[j][1];
-        }
-    }
-}
-
-
-void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
-    double *t, int *ip, double *w)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int i, j;
-    
-    if (n2 > 2) {
-        for (j = 0; j < n2; j += 4) {
-            for (i = 0; i < n1; i++) {
-                t[i] = a[i][j];
-                t[n1 + i] = a[i][j + 1];
-                t[2 * n1 + i] = a[i][j + 2];
-                t[3 * n1 + i] = a[i][j + 3];
-            }
-            if (ics == 0) {
-                ddct(n1, isgn, t, ip, w);
-                ddct(n1, isgn, &t[n1], ip, w);
-                ddct(n1, isgn, &t[2 * n1], ip, w);
-                ddct(n1, isgn, &t[3 * n1], ip, w);
-            } else {
-                ddst(n1, isgn, t, ip, w);
-                ddst(n1, isgn, &t[n1], ip, w);
-                ddst(n1, isgn, &t[2 * n1], ip, w);
-                ddst(n1, isgn, &t[3 * n1], ip, w);
-            }
-            for (i = 0; i < n1; i++) {
-                a[i][j] = t[i];
-                a[i][j + 1] = t[n1 + i];
-                a[i][j + 2] = t[2 * n1 + i];
-                a[i][j + 3] = t[3 * n1 + i];
-            }
-        }
-    } else if (n2 == 2) {
-        for (i = 0; i < n1; i++) {
-            t[i] = a[i][0];
-            t[n1 + i] = a[i][1];
-        }
-        if (ics == 0) {
-            ddct(n1, isgn, t, ip, w);
-            ddct(n1, isgn, &t[n1], ip, w);
-        } else {
-            ddst(n1, isgn, t, ip, w);
-            ddst(n1, isgn, &t[n1], ip, w);
-        }
-        for (i = 0; i < n1; i++) {
-            a[i][0] = t[i];
-            a[i][1] = t[n1 + i];
-        }
-    }
-}
-
-
-#ifdef USE_FFT2D_THREADS
-struct fft2d_arg_st {
-    int nthread;
-    int n0;
-    int n1;
-    int n2;
-    int ic;
-    int isgn;
-    double **a;
-    double *t;
-    int *ip;
-    double *w;
-};
-typedef struct fft2d_arg_st fft2d_arg_t;
-
-
-void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
-    int *ip, double *w)
-{
-    void *xdft2d0_th(void *p);
-    fft2d_thread_t th[FFT2D_MAX_THREADS];
-    fft2d_arg_t ag[FFT2D_MAX_THREADS];
-    int nthread, i;
-    
-    nthread = FFT2D_MAX_THREADS;
-    if (nthread > n1) {
-        nthread = n1;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].ic = icr;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft2d_thread_wait(th[i]);
-    }
-}
-
-
-void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
-    int *ip, double *w)
-{
-    void *cdft2d_th(void *p);
-    fft2d_thread_t th[FFT2D_MAX_THREADS];
-    fft2d_arg_t ag[FFT2D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT2D_MAX_THREADS;
-    nt = 8 * n1;
-    if (n2 == 4 * FFT2D_MAX_THREADS) {
-        nt >>= 1;
-    } else if (n2 < 4 * FFT2D_MAX_THREADS) {
-        nthread = n2 >> 1;
-        nt >>= 2;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft2d_thread_create(&th[i], cdft2d_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft2d_thread_wait(th[i]);
-    }
-}
-
-
-void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
-    int *ip, double *w)
-{
-    void *ddxt2d0_th(void *p);
-    fft2d_thread_t th[FFT2D_MAX_THREADS];
-    fft2d_arg_t ag[FFT2D_MAX_THREADS];
-    int nthread, i;
-    
-    nthread = FFT2D_MAX_THREADS;
-    if (nthread > n1) {
-        nthread = n1;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].ic = ics;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft2d_thread_wait(th[i]);
-    }
-}
-
-
-void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
-    double *t, int *ip, double *w)
-{
-    void *ddxt2d_th(void *p);
-    fft2d_thread_t th[FFT2D_MAX_THREADS];
-    fft2d_arg_t ag[FFT2D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT2D_MAX_THREADS;
-    nt = 4 * n1;
-    if (n2 == 2 * FFT2D_MAX_THREADS) {
-        nt >>= 1;
-    } else if (n2 < 2 * FFT2D_MAX_THREADS) {
-        nthread = n2;
-        nt >>= 2;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].ic = ics;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft2d_thread_wait(th[i]);
-    }
-}
-
-
-void *xdft2d0_th(void *p)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    void rdft(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, icr, isgn, *ip, i;
-    double **a, *w;
-    
-    nthread = ((fft2d_arg_t *) p)->nthread;
-    n0 = ((fft2d_arg_t *) p)->n0;
-    n1 = ((fft2d_arg_t *) p)->n1;
-    n2 = ((fft2d_arg_t *) p)->n2;
-    icr = ((fft2d_arg_t *) p)->ic;
-    isgn = ((fft2d_arg_t *) p)->isgn;
-    a = ((fft2d_arg_t *) p)->a;
-    ip = ((fft2d_arg_t *) p)->ip;
-    w = ((fft2d_arg_t *) p)->w;
-    if (icr == 0) {
-        for (i = n0; i < n1; i += nthread) {
-            cdft(n2, isgn, a[i], ip, w);
-        }
-    } else {
-        for (i = n0; i < n1; i += nthread) {
-            rdft(n2, isgn, a[i], ip, w);
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *cdft2d_th(void *p)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, isgn, *ip, i, j;
-    double **a, *t, *w;
-    
-    nthread = ((fft2d_arg_t *) p)->nthread;
-    n0 = ((fft2d_arg_t *) p)->n0;
-    n1 = ((fft2d_arg_t *) p)->n1;
-    n2 = ((fft2d_arg_t *) p)->n2;
-    isgn = ((fft2d_arg_t *) p)->isgn;
-    a = ((fft2d_arg_t *) p)->a;
-    t = ((fft2d_arg_t *) p)->t;
-    ip = ((fft2d_arg_t *) p)->ip;
-    w = ((fft2d_arg_t *) p)->w;
-    if (n2 > 4 * nthread) {
-        for (j = 8 * n0; j < n2; j += 8 * nthread) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j];
-                t[2 * i + 1] = a[i][j + 1];
-                t[2 * n1 + 2 * i] = a[i][j + 2];
-                t[2 * n1 + 2 * i + 1] = a[i][j + 3];
-                t[4 * n1 + 2 * i] = a[i][j + 4];
-                t[4 * n1 + 2 * i + 1] = a[i][j + 5];
-                t[6 * n1 + 2 * i] = a[i][j + 6];
-                t[6 * n1 + 2 * i + 1] = a[i][j + 7];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-            cdft(2 * n1, isgn, &t[4 * n1], ip, w);
-            cdft(2 * n1, isgn, &t[6 * n1], ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j] = t[2 * i];
-                a[i][j + 1] = t[2 * i + 1];
-                a[i][j + 2] = t[2 * n1 + 2 * i];
-                a[i][j + 3] = t[2 * n1 + 2 * i + 1];
-                a[i][j + 4] = t[4 * n1 + 2 * i];
-                a[i][j + 5] = t[4 * n1 + 2 * i + 1];
-                a[i][j + 6] = t[6 * n1 + 2 * i];
-                a[i][j + 7] = t[6 * n1 + 2 * i + 1];
-            }
-        }
-    } else if (n2 == 4 * nthread) {
-        for (i = 0; i < n1; i++) {
-            t[2 * i] = a[i][4 * n0];
-            t[2 * i + 1] = a[i][4 * n0 + 1];
-            t[2 * n1 + 2 * i] = a[i][4 * n0 + 2];
-            t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3];
-        }
-        cdft(2 * n1, isgn, t, ip, w);
-        cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-        for (i = 0; i < n1; i++) {
-            a[i][4 * n0] = t[2 * i];
-            a[i][4 * n0 + 1] = t[2 * i + 1];
-            a[i][4 * n0 + 2] = t[2 * n1 + 2 * i];
-            a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1];
-        }
-    } else if (n2 == 2 * nthread) {
-        for (i = 0; i < n1; i++) {
-            t[2 * i] = a[i][2 * n0];
-            t[2 * i + 1] = a[i][2 * n0 + 1];
-        }
-        cdft(2 * n1, isgn, t, ip, w);
-        for (i = 0; i < n1; i++) {
-            a[i][2 * n0] = t[2 * i];
-            a[i][2 * n0 + 1] = t[2 * i + 1];
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *ddxt2d0_th(void *p)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, ics, isgn, *ip, i;
-    double **a, *w;
-    
-    nthread = ((fft2d_arg_t *) p)->nthread;
-    n0 = ((fft2d_arg_t *) p)->n0;
-    n1 = ((fft2d_arg_t *) p)->n1;
-    n2 = ((fft2d_arg_t *) p)->n2;
-    ics = ((fft2d_arg_t *) p)->ic;
-    isgn = ((fft2d_arg_t *) p)->isgn;
-    a = ((fft2d_arg_t *) p)->a;
-    ip = ((fft2d_arg_t *) p)->ip;
-    w = ((fft2d_arg_t *) p)->w;
-    if (ics == 0) {
-        for (i = n0; i < n1; i += nthread) {
-            ddct(n2, isgn, a[i], ip, w);
-        }
-    } else {
-        for (i = n0; i < n1; i += nthread) {
-            ddst(n2, isgn, a[i], ip, w);
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *ddxt2d_th(void *p)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, ics, isgn, *ip, i, j;
-    double **a, *t, *w;
-    
-    nthread = ((fft2d_arg_t *) p)->nthread;
-    n0 = ((fft2d_arg_t *) p)->n0;
-    n1 = ((fft2d_arg_t *) p)->n1;
-    n2 = ((fft2d_arg_t *) p)->n2;
-    ics = ((fft2d_arg_t *) p)->ic;
-    isgn = ((fft2d_arg_t *) p)->isgn;
-    a = ((fft2d_arg_t *) p)->a;
-    t = ((fft2d_arg_t *) p)->t;
-    ip = ((fft2d_arg_t *) p)->ip;
-    w = ((fft2d_arg_t *) p)->w;
-    if (n2 > 2 * nthread) {
-        for (j = 4 * n0; j < n2; j += 4 * nthread) {
-            for (i = 0; i < n1; i++) {
-                t[i] = a[i][j];
-                t[n1 + i] = a[i][j + 1];
-                t[2 * n1 + i] = a[i][j + 2];
-                t[3 * n1 + i] = a[i][j + 3];
-            }
-            if (ics == 0) {
-                ddct(n1, isgn, t, ip, w);
-                ddct(n1, isgn, &t[n1], ip, w);
-                ddct(n1, isgn, &t[2 * n1], ip, w);
-                ddct(n1, isgn, &t[3 * n1], ip, w);
-            } else {
-                ddst(n1, isgn, t, ip, w);
-                ddst(n1, isgn, &t[n1], ip, w);
-                ddst(n1, isgn, &t[2 * n1], ip, w);
-                ddst(n1, isgn, &t[3 * n1], ip, w);
-            }
-            for (i = 0; i < n1; i++) {
-                a[i][j] = t[i];
-                a[i][j + 1] = t[n1 + i];
-                a[i][j + 2] = t[2 * n1 + i];
-                a[i][j + 3] = t[3 * n1 + i];
-            }
-        }
-    } else if (n2 == 2 * nthread) {
-        for (i = 0; i < n1; i++) {
-            t[i] = a[i][2 * n0];
-            t[n1 + i] = a[i][2 * n0 + 1];
-        }
-        if (ics == 0) {
-            ddct(n1, isgn, t, ip, w);
-            ddct(n1, isgn, &t[n1], ip, w);
-        } else {
-            ddst(n1, isgn, t, ip, w);
-            ddst(n1, isgn, &t[n1], ip, w);
-        }
-        for (i = 0; i < n1; i++) {
-            a[i][2 * n0] = t[i];
-            a[i][2 * n0 + 1] = t[n1 + i];
-        }
-    } else if (n2 == nthread) {
-        for (i = 0; i < n1; i++) {
-            t[i] = a[i][n0];
-        }
-        if (ics == 0) {
-            ddct(n1, isgn, t, ip, w);
-        } else {
-            ddst(n1, isgn, t, ip, w);
-        }
-        for (i = 0; i < n1; i++) {
-            a[i][n0] = t[i];
-        }
-    }
-    return (void *) 0;
-}
-#endif /* USE_FFT2D_THREADS */
-
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg2d.f b/third_party/tensorflow_dependencies/fft2d/fftsg2d.f
deleted file mode 100644
index 49a9d6c..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg2d.f
+++ /dev/null
@@ -1,562 +0,0 @@
-! Fast Fourier/Cosine/Sine Transform
-!     dimension   :two
-!     data length :power of 2
-!     decimation  :frequency
-!     radix       :split-radix, row-column
-!     data        :inplace
-!     table       :use
-! subroutines
-!     cdft2d: Complex Discrete Fourier Transform
-!     rdft2d: Real Discrete Fourier Transform
-!     ddct2d: Discrete Cosine Transform
-!     ddst2d: Discrete Sine Transform
-! necessary package
-!     fftsg.f  : 1D-FFT package
-!
-!
-! -------- Complex DFT (Discrete Fourier Transform) --------
-!     [definition]
-!         <case1>
-!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
-!                            exp(2*pi*i*j1*k1/n1) * 
-!                            exp(2*pi*i*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2>
-!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
-!                            exp(-2*pi*i*j1*k1/n1) * 
-!                            exp(-2*pi*i*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         2*n1   :data length (integer)
-!                 n1 >= 1, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 1, n2 = power of 2
-!         a(0:2*n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 input data
-!                     a(2*j1,j2) = Re(x(j1,j2)), 
-!                     a(2*j1+1,j2) = Im(x(j1,j2)), 
-!                     0<=j1<n1, 0<=j2<n2
-!                 output data
-!                     a(2*k1,k2) = Re(X(k1,k2)), 
-!                     a(2*k1+1,k2) = Im(X(k1,k2)), 
-!                     0<=k1<n1, 0<=k2<n2
-!         t(0:8*n2-1)
-!                :work area (real*8)
-!                 length of t >= 8*n2
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/2, n2/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
-!         is 
-!             call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, 2 * n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (1.0d0 / n1 / n2)
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- Real DFT / Inverse of Real DFT --------
-!     [definition]
-!         <case1> RDFT
-!             R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!             I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> IRDFT (excluding scale)
-!             a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
-!                            (R(j1,j2) * 
-!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
-!                            I(j1,j2) * 
-!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
-!                            0<=k1<n1, 0<=k2<n2
-!         (notes: R(n1-k1,n2-k2) = R(k1,k2), 
-!                 I(n1-k1,n2-k2) = -I(k1,k2), 
-!                 R(n1-k1,0) = R(k1,0), 
-!                 I(n1-k1,0) = -I(k1,0), 
-!                 R(0,n2-k2) = R(0,k2), 
-!                 I(0,n2-k2) = -I(0,k2), 
-!                 0<k1<n1, 0<k2<n2)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     output data
-!                         a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2), 
-!                         a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2), 
-!                             0<k1<n1/2, 0<k2<n2, 
-!                         a(2*k1,0) = R(k1,0) = R(n1-k1,0), 
-!                         a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0), 
-!                             0<k1<n1/2, 
-!                         a(0,k2) = R(0,k2) = R(0,n2-k2), 
-!                         a(1,k2) = I(0,k2) = -I(0,n2-k2), 
-!                         a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2), 
-!                         a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2), 
-!                             0<k2<n2/2, 
-!                         a(0,0) = R(0,0), 
-!                         a(1,0) = R(n1/2,0), 
-!                         a(0,n2/2) = R(0,n2/2), 
-!                         a(1,n2/2) = R(n1/2,n2/2)
-!                 <case2>
-!                     input data
-!                         a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2), 
-!                         a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2), 
-!                             0<j1<n1/2, 0<j2<n2, 
-!                         a(2*j1,0) = R(j1,0) = R(n1-j1,0), 
-!                         a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0), 
-!                             0<j1<n1/2, 
-!                         a(0,j2) = R(0,j2) = R(0,n2-j2), 
-!                         a(1,j2) = I(0,j2) = -I(0,n2-j2), 
-!                         a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2), 
-!                         a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2), 
-!                             0<j2<n2/2, 
-!                         a(0,0) = R(0,0), 
-!                         a(1,0) = R(n1/2,0), 
-!                         a(0,n2/2) = R(0,n2/2), 
-!                         a(1,n2/2) = R(n1/2,n2/2)
-!                 ---- output ordering ----
-!                     call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
-!                     call rdft2dsort(n1max, n1, n2, 1, a)
-!                     ! stored data is a(0:n1-1,0:n2+1):
-!                     ! a(2*k1,k2) = R(k1,k2), 
-!                     ! a(2*k1+1,k2) = I(k1,k2), 
-!                     ! 0<=k1<=n1/2, 0<=k2<n2.
-!                     ! the stored data is larger than the input data!
-!                 ---- input ordering ----
-!                     call rdft2dsort(n1max, n1, n2, -1, a)
-!                     call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
-!         t(0:8*n2-1)
-!                :work area (real*8)
-!                 length of t >= 8*n2
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/4, n2/2) + n1/4
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
-!         is 
-!             call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (2.0d0 / n1 / n2)
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-!     [definition]
-!         <case1> IDCT (excluding scale)
-!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(pi*j1*(k1+1/2)/n1) * 
-!                            cos(pi*j2*(k2+1/2)/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> DCT
-!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            cos(pi*(j1+1/2)*k1/n1) * 
-!                            cos(pi*(j2+1/2)*k2/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 output data
-!                     a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
-!         t(0:4*n2-1)
-!                :work area (real*8)
-!                 length of t >= 4*n2
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2/2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1*3/2, n2*3/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
-!         is 
-!             do j1 = 0, n1 - 1
-!                 a(j1, 0) = a(j1, 0) * 0.5d0
-!             end do
-!             do j2 = 0, n2 - 1
-!                 a(0, j2) = a(0, j2) * 0.5d0
-!             end do
-!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DST (Discrete Sine Transform) / Inverse of DST --------
-!     [definition]
-!         <case1> IDST (excluding scale)
-!             S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) * 
-!                            sin(pi*j1*(k1+1/2)/n1) * 
-!                            sin(pi*j2*(k2+1/2)/n2), 
-!                            0<=k1<n1, 0<=k2<n2
-!         <case2> DST
-!             S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
-!                            sin(pi*(j1+1/2)*k1/n1) * 
-!                            sin(pi*(j2+1/2)*k2/n2), 
-!                            0<k1<=n1, 0<k2<=n2
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row size of the 2D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         a(0:n1-1,0:n2-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     input data
-!                         a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2, 
-!                         a(j1,0) = A(j1,n2), 0<j1<n1, 
-!                         a(0,j2) = A(n1,j2), 0<j2<n2, 
-!                         a(0,0) = A(n1,n2)
-!                         (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
-!                     output data
-!                         a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
-!                 <case2>
-!                     output data
-!                         a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2, 
-!                         a(k1,0) = S(k1,n2), 0<k1<n1, 
-!                         a(0,k2) = S(n1,k2), 0<k2<n2, 
-!                         a(0,0) = S(n1,n2)
-!                         (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
-!         t(0:4*n2-1)
-!                :work area (real*8)
-!                 length of t >= 4*n2
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2/2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1*3/2, n2*3/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
-!         is 
-!             do j1 = 0, n1 - 1
-!                 a(j1, 0) = a(j1, 0) * 0.5d0
-!             end do
-!             do j2 = 0, n2 - 1
-!                 a(0, j2) = a(0, j2) * 0.5d0
-!             end do
-!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
-!                 end do
-!             end do
-!         .
-!
-!
-      subroutine cdft2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
-     &    w(0 : *)
-      n = max(n1, 2 * n2)
-      if (n .gt. 4 * ip(0)) then
-          call makewt(n / 4, ip, w)
-      end if
-      do j = 0, n2 - 1
-          call cdft(n1, isgn, a(0, j), ip, w)
-      end do
-      call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
-      end
-!
-      subroutine rdft2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
-     &    w(0 : *)
-      n = max(n1, 2 * n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n1 .gt. 4 * nc) then
-          nc = n1 / 4
-          call makect(nc, ip, w(nw))
-      end if
-      if (isgn .lt. 0) then
-          call rdft2d_sub(n1max, n1, n2, isgn, a)
-          call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
-      end if
-      do j = 0, n2 - 1
-          call rdft(n1, isgn, a(0, j), ip, w)
-      end do
-      if (isgn .ge. 0) then
-          call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
-          call rdft2d_sub(n1max, n1, n2, isgn, a)
-      end if
-      end
-!
-      subroutine rdft2dsort(n1max, n1, n2, isgn, a)
-      integer n1max, n1, n2, isgn, n2h, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), x, y
-      n2h = n2 / 2
-      if (isgn .lt. 0) then
-          do j = n2h + 1, n2 - 1
-              a(0, j) = a(n1 + 1, j)
-              a(1, j) = a(n1, j)
-          end do
-          a(1, 0) = a(n1, 0)
-          a(1, n2h) = a(n1, n2h)
-      else
-          do j = n2h + 1, n2 - 1
-              y = a(0, j)
-              x = a(1, j)
-              a(n1, j) = x
-              a(n1 + 1, j) = y
-              a(n1, n2 - j) = x
-              a(n1 + 1, n2 - j) = -y
-              a(0, j) = a(0, n2 - j)
-              a(1, j) = -a(1, n2 - j)
-          end do
-          a(n1, 0) = a(1, 0)
-          a(n1 + 1, 0) = 0
-          a(1, 0) = 0
-          a(n1, n2h) = a(1, n2h)
-          a(n1 + 1, n2h) = 0
-          a(1, n2h) = 0
-      end if
-      end
-!
-      subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
-     &    w(0 : *)
-      n = max(n1, n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      do j = 0, n2 - 1
-          call ddct(n1, isgn, a(0, j), ip, w)
-      end do
-      call ddxt2d_sub(n1max, n1, n2, 0, isgn, a, t, ip, w)
-      end
-!
-      subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
-     &    w(0 : *)
-      n = max(n1, n2)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      do j = 0, n2 - 1
-          call ddst(n1, isgn, a(0, j), ip, w)
-      end do
-      call ddxt2d_sub(n1max, n1, n2, 1, isgn, a, t, ip, w)
-      end
-!
-! -------- child routines --------
-!
-      subroutine cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
-      integer n1max, n1, n2, isgn, ip(0 : *), i, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
-     &    w(0 : *)
-      if (n1 .gt. 4) then
-          do i = 0, n1 - 8, 8
-              do j = 0, n2 - 1
-                  t(2 * j) = a(i, j)
-                  t(2 * j + 1) = a(i + 1, j)
-                  t(2 * n2 + 2 * j) = a(i + 2, j)
-                  t(2 * n2 + 2 * j + 1) = a(i + 3, j)
-                  t(4 * n2 + 2 * j) = a(i + 4, j)
-                  t(4 * n2 + 2 * j + 1) = a(i + 5, j)
-                  t(6 * n2 + 2 * j) = a(i + 6, j)
-                  t(6 * n2 + 2 * j + 1) = a(i + 7, j)
-              end do
-              call cdft(2 * n2, isgn, t, ip, w)
-              call cdft(2 * n2, isgn, t(2 * n2), ip, w)
-              call cdft(2 * n2, isgn, t(4 * n2), ip, w)
-              call cdft(2 * n2, isgn, t(6 * n2), ip, w)
-              do j = 0, n2 - 1
-                  a(i, j) = t(2 * j)
-                  a(i + 1, j) = t(2 * j + 1)
-                  a(i + 2, j) = t(2 * n2 + 2 * j)
-                  a(i + 3, j) = t(2 * n2 + 2 * j + 1)
-                  a(i + 4, j) = t(4 * n2 + 2 * j)
-                  a(i + 5, j) = t(4 * n2 + 2 * j + 1)
-                  a(i + 6, j) = t(6 * n2 + 2 * j)
-                  a(i + 7, j) = t(6 * n2 + 2 * j + 1)
-              end do
-          end do
-      else if (n1 .eq. 4) then
-          do j = 0, n2 - 1
-              t(2 * j) = a(0, j)
-              t(2 * j + 1) = a(1, j)
-              t(2 * n2 + 2 * j) = a(2, j)
-              t(2 * n2 + 2 * j + 1) = a(3, j)
-          end do
-          call cdft(2 * n2, isgn, t, ip, w)
-          call cdft(2 * n2, isgn, t(2 * n2), ip, w)
-          do j = 0, n2 - 1
-              a(0, j) = t(2 * j)
-              a(1, j) = t(2 * j + 1)
-              a(2, j) = t(2 * n2 + 2 * j)
-              a(3, j) = t(2 * n2 + 2 * j + 1)
-          end do
-      else if (n1 .eq. 2) then
-          do j = 0, n2 - 1
-              t(2 * j) = a(0, j)
-              t(2 * j + 1) = a(1, j)
-          end do
-          call cdft(2 * n2, isgn, t, ip, w)
-          do j = 0, n2 - 1
-              a(0, j) = t(2 * j)
-              a(1, j) = t(2 * j + 1)
-          end do
-      end if
-      end
-!
-      subroutine rdft2d_sub(n1max, n1, n2, isgn, a)
-      integer n1max, n1, n2, isgn, n2h, i, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), xi
-      n2h = n2 / 2
-      if (isgn .lt. 0) then
-          do i = 1, n2h - 1
-              j = n2 - i
-              xi = a(0, i) - a(0, j)
-              a(0, i) = a(0, i) + a(0, j)
-              a(0, j) = xi
-              xi = a(1, j) - a(1, i)
-              a(1, i) = a(1, i) + a(1, j)
-              a(1, j) = xi
-          end do
-      else
-          do i = 1, n2h - 1
-              j = n2 - i
-              a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
-              a(0, i) = a(0, i) - a(0, j)
-              a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
-              a(1, i) = a(1, i) - a(1, j)
-          end do
-      end if
-      end
-!
-      subroutine ddxt2d_sub(n1max, n1, n2, ics, isgn, a, t, 
-     &    ip, w)
-      integer n1max, n1, n2, ics, isgn, ip(0 : *), i, j
-      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
-     &    w(0 : *)
-      if (n1 .gt. 2) then
-          do i = 0, n1 - 4, 4
-              do j = 0, n2 - 1
-                  t(j) = a(i, j)
-                  t(n2 + j) = a(i + 1, j)
-                  t(2 * n2 + j) = a(i + 2, j)
-                  t(3 * n2 + j) = a(i + 3, j)
-              end do
-              if (ics .eq. 0) then
-                  call ddct(n2, isgn, t, ip, w)
-                  call ddct(n2, isgn, t(n2), ip, w)
-                  call ddct(n2, isgn, t(2 * n2), ip, w)
-                  call ddct(n2, isgn, t(3 * n2), ip, w)
-              else
-                  call ddst(n2, isgn, t, ip, w)
-                  call ddst(n2, isgn, t(n2), ip, w)
-                  call ddst(n2, isgn, t(2 * n2), ip, w)
-                  call ddst(n2, isgn, t(3 * n2), ip, w)
-              end if
-              do j = 0, n2 - 1
-                  a(i, j) = t(j)
-                  a(i + 1, j) = t(n2 + j)
-                  a(i + 2, j) = t(2 * n2 + j)
-                  a(i + 3, j) = t(3 * n2 + j)
-              end do
-          end do
-      else if (n1 .eq. 2) then
-          do j = 0, n2 - 1
-              t(j) = a(0, j)
-              t(n2 + j) = a(1, j)
-          end do
-          if (ics .eq. 0) then
-              call ddct(n2, isgn, t, ip, w)
-              call ddct(n2, isgn, t(n2), ip, w)
-          else
-              call ddst(n2, isgn, t, ip, w)
-              call ddst(n2, isgn, t(n2), ip, w)
-          end if
-          do j = 0, n2 - 1
-              a(0, j) = t(j)
-              a(1, j) = t(n2 + j)
-          end do
-      end if
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg3d.c b/third_party/tensorflow_dependencies/fft2d/fftsg3d.c
deleted file mode 100644
index 62f7daa..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg3d.c
+++ /dev/null
@@ -1,1695 +0,0 @@
-/*
-Fast Fourier/Cosine/Sine Transform
-    dimension   :three
-    data length :power of 2
-    decimation  :frequency
-    radix       :split-radix, row-column
-    data        :inplace
-    table       :use
-functions
-    cdft3d: Complex Discrete Fourier Transform
-    rdft3d: Real Discrete Fourier Transform
-    ddct3d: Discrete Cosine Transform
-    ddst3d: Discrete Sine Transform
-function prototypes
-    void cdft3d(int, int, int, int, double ***, double *, int *, double *);
-    void rdft3d(int, int, int, int, double ***, double *, int *, double *);
-    void rdft3dsort(int, int, int, int, double ***);
-    void ddct3d(int, int, int, int, double ***, double *, int *, double *);
-    void ddst3d(int, int, int, int, double ***, double *, int *, double *);
-necessary package
-    fftsg.c  : 1D-FFT package
-macro definitions
-    USE_FFT3D_PTHREADS : default=not defined
-        FFT3D_MAX_THREADS     : must be 2^N, default=4
-        FFT3D_THREADS_BEGIN_N : default=65536
-    USE_FFT3D_WINTHREADS : default=not defined
-        FFT3D_MAX_THREADS     : must be 2^N, default=4
-        FFT3D_THREADS_BEGIN_N : default=131072
-
-
--------- Complex DFT (Discrete Fourier Transform) --------
-    [definition]
-        <case1>
-            X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                x[j1][j2][j3] * 
-                                exp(2*pi*i*j1*k1/n1) * 
-                                exp(2*pi*i*j2*k2/n2) * 
-                                exp(2*pi*i*j3*k3/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        <case2>
-            X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                x[j1][j2][j3] * 
-                                exp(-2*pi*i*j1*k1/n1) * 
-                                exp(-2*pi*i*j2*k2/n2) * 
-                                exp(-2*pi*i*j3*k3/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 1, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 1, n2 = power of 2
-        2*n3   :data length (int)
-                n3 >= 1, n3 = power of 2
-        a[0...n1-1][0...n2-1][0...2*n3-1]
-               :input/output data (double ***)
-                input data
-                    a[j1][j2][2*j3] = Re(x[j1][j2][j3]), 
-                    a[j1][j2][2*j3+1] = Im(x[j1][j2][j3]), 
-                    0<=j1<n1, 0<=j2<n2, 0<=j3<n3
-                output data
-                    a[k1][k2][2*k3] = Re(X[k1][k2][k3]), 
-                    a[k1][k2][2*k3+1] = Im(X[k1][k2][k3]), 
-                    0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        t[0...*]
-               :work area (double *)
-                length of t >= max(8*n1, 8*n2),        if single thread, 
-                length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS, 
-                                                       if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2, n3))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/2, n3/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
-        is 
-            cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    for (j3 = 0; j3 <= 2 * n3 - 1; j3++) {
-                        a[j1][j2][j3] *= 1.0 / n1 / n2 / n3;
-                    }
-                }
-            }
-        .
-
-
--------- Real DFT / Inverse of Real DFT --------
-    [definition]
-        <case1> RDFT
-            R[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                a[j1][j2][j3] * 
-                                cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-                                    2*pi*j3*k3/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-            I[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                a[j1][j2][j3] * 
-                                sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-                                    2*pi*j3*k3/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        <case2> IRDFT (excluding scale)
-            a[k1][k2][k3] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                (R[j1][j2][j3] * 
-                                cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-                                    2*pi*j3*k3/n3) + 
-                                I[j1][j2][j3] * 
-                                sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-                                    2*pi*j3*k3/n3)), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        (notes: R[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = R[k1][k2][k3], 
-                I[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = -I[k1][k2][k3], 
-                0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            rdft3d(n1, n2, n3, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            rdft3d(n1, n2, n3, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        n3     :data length (int)
-                n3 >= 2, n3 = power of 2
-        a[0...n1-1][0...n2-1][0...n3-1]
-               :input/output data (double ***)
-                <case1>
-                    output data
-                        a[k1][k2][2*k3] = R[k1][k2][k3]
-                                        = R[(n1-k1)%n1][(n2-k2)%n2][n3-k3], 
-                        a[k1][k2][2*k3+1] = I[k1][k2][k3]
-                                          = -I[(n1-k1)%n1][(n2-k2)%n2][n3-k3], 
-                            0<=k1<n1, 0<=k2<n2, 0<k3<n3/2, 
-                            (n%m : n mod m), 
-                        a[k1][k2][0] = R[k1][k2][0]
-                                     = R[(n1-k1)%n1][n2-k2][0], 
-                        a[k1][k2][1] = I[k1][k2][0]
-                                     = -I[(n1-k1)%n1][n2-k2][0], 
-                        a[k1][n2-k2][1] = R[(n1-k1)%n1][k2][n3/2]
-                                        = R[k1][n2-k2][n3/2], 
-                        a[k1][n2-k2][0] = -I[(n1-k1)%n1][k2][n3/2]
-                                        = I[k1][n2-k2][n3/2], 
-                            0<=k1<n1, 0<k2<n2/2, 
-                        a[k1][0][0] = R[k1][0][0]
-                                    = R[n1-k1][0][0], 
-                        a[k1][0][1] = I[k1][0][0]
-                                    = -I[n1-k1][0][0], 
-                        a[k1][n2/2][0] = R[k1][n2/2][0]
-                                       = R[n1-k1][n2/2][0], 
-                        a[k1][n2/2][1] = I[k1][n2/2][0]
-                                       = -I[n1-k1][n2/2][0], 
-                        a[n1-k1][0][1] = R[k1][0][n3/2]
-                                       = R[n1-k1][0][n3/2], 
-                        a[n1-k1][0][0] = -I[k1][0][n3/2]
-                                       = I[n1-k1][0][n3/2], 
-                        a[n1-k1][n2/2][1] = R[k1][n2/2][n3/2]
-                                          = R[n1-k1][n2/2][n3/2], 
-                        a[n1-k1][n2/2][0] = -I[k1][n2/2][n3/2]
-                                          = I[n1-k1][n2/2][n3/2], 
-                            0<k1<n1/2, 
-                        a[0][0][0] = R[0][0][0], 
-                        a[0][0][1] = R[0][0][n3/2], 
-                        a[0][n2/2][0] = R[0][n2/2][0], 
-                        a[0][n2/2][1] = R[0][n2/2][n3/2], 
-                        a[n1/2][0][0] = R[n1/2][0][0], 
-                        a[n1/2][0][1] = R[n1/2][0][n3/2], 
-                        a[n1/2][n2/2][0] = R[n1/2][n2/2][0], 
-                        a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
-                <case2>
-                    input data
-                        a[j1][j2][2*j3] = R[j1][j2][j3]
-                                        = R[(n1-j1)%n1][(n2-j2)%n2][n3-j3], 
-                        a[j1][j2][2*j3+1] = I[j1][j2][j3]
-                                          = -I[(n1-j1)%n1][(n2-j2)%n2][n3-j3], 
-                            0<=j1<n1, 0<=j2<n2, 0<j3<n3/2, 
-                        a[j1][j2][0] = R[j1][j2][0]
-                                     = R[(n1-j1)%n1][n2-j2][0], 
-                        a[j1][j2][1] = I[j1][j2][0]
-                                     = -I[(n1-j1)%n1][n2-j2][0], 
-                        a[j1][n2-j2][1] = R[(n1-j1)%n1][j2][n3/2]
-                                        = R[j1][n2-j2][n3/2], 
-                        a[j1][n2-j2][0] = -I[(n1-j1)%n1][j2][n3/2]
-                                        = I[j1][n2-j2][n3/2], 
-                            0<=j1<n1, 0<j2<n2/2, 
-                        a[j1][0][0] = R[j1][0][0]
-                                    = R[n1-j1][0][0], 
-                        a[j1][0][1] = I[j1][0][0]
-                                    = -I[n1-j1][0][0], 
-                        a[j1][n2/2][0] = R[j1][n2/2][0]
-                                       = R[n1-j1][n2/2][0], 
-                        a[j1][n2/2][1] = I[j1][n2/2][0]
-                                       = -I[n1-j1][n2/2][0], 
-                        a[n1-j1][0][1] = R[j1][0][n3/2]
-                                       = R[n1-j1][0][n3/2], 
-                        a[n1-j1][0][0] = -I[j1][0][n3/2]
-                                       = I[n1-j1][0][n3/2], 
-                        a[n1-j1][n2/2][1] = R[j1][n2/2][n3/2]
-                                          = R[n1-j1][n2/2][n3/2], 
-                        a[n1-j1][n2/2][0] = -I[j1][n2/2][n3/2]
-                                          = I[n1-j1][n2/2][n3/2], 
-                            0<j1<n1/2, 
-                        a[0][0][0] = R[0][0][0], 
-                        a[0][0][1] = R[0][0][n3/2], 
-                        a[0][n2/2][0] = R[0][n2/2][0], 
-                        a[0][n2/2][1] = R[0][n2/2][n3/2], 
-                        a[n1/2][0][0] = R[n1/2][0][0], 
-                        a[n1/2][0][1] = R[n1/2][0][n3/2], 
-                        a[n1/2][n2/2][0] = R[n1/2][n2/2][0], 
-                        a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
-                ---- output ordering ----
-                    rdft3d(n1, n2, n3, 1, a, t, ip, w);
-                    rdft3dsort(n1, n2, n3, 1, a);
-                    // stored data is a[0...n1-1][0...n2-1][0...n3+1]:
-                    // a[k1][k2][2*k3] = R[k1][k2][k3], 
-                    // a[k1][k2][2*k3+1] = I[k1][k2][k3], 
-                    // 0<=k1<n1, 0<=k2<n2, 0<=k3<=n3/2.
-                    // the stored data is larger than the input data!
-                ---- input ordering ----
-                    rdft3dsort(n1, n2, n3, -1, a);
-                    rdft3d(n1, n2, n3, -1, a, t, ip, w);
-        t[0...*]
-               :work area (double *)
-                length of t >= max(8*n1, 8*n2),        if single thread, 
-                length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS, 
-                                                       if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1, n2, n3/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1/2, n2/2, n3/4) + n3/4
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            rdft3d(n1, n2, n3, 1, a, t, ip, w);
-        is 
-            rdft3d(n1, n2, n3, -1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    for (j3 = 0; j3 <= n3 - 1; j3++) {
-                        a[j1][j2][j3] *= 2.0 / n1 / n2 / n3;
-                    }
-                }
-            }
-        .
-
-
--------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> IDCT (excluding scale)
-            C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                a[j1][j2][j3] * 
-                                cos(pi*j1*(k1+1/2)/n1) * 
-                                cos(pi*j2*(k2+1/2)/n2) * 
-                                cos(pi*j3*(k3+1/2)/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        <case2> DCT
-            C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                a[j1][j2][j3] * 
-                                cos(pi*(j1+1/2)*k1/n1) * 
-                                cos(pi*(j2+1/2)*k2/n2) * 
-                                cos(pi*(j3+1/2)*k3/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddct3d(n1, n2, n3, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddct3d(n1, n2, n3, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        n3     :data length (int)
-                n3 >= 2, n3 = power of 2
-        a[0...n1-1][0...n2-1][0...n3-1]
-               :input/output data (double ***)
-                output data
-                    a[k1][k2][k3] = C[k1][k2][k3], 
-                        0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        t[0...*]
-               :work area (double *)
-                length of t >= max(4*n1, 4*n2),        if single thread, 
-                length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS, 
-                                                       if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1/2, n2/2, n3/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1*3/2, n2*3/2, n3*3/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddct3d(n1, n2, n3, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2][0] *= 0.5;
-                }
-                for (j3 = 0; j3 <= n3 - 1; j3++) {
-                    a[j1][0][j3] *= 0.5;
-                }
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                for (j3 = 0; j3 <= n3 - 1; j3++) {
-                    a[0][j2][j3] *= 0.5;
-                }
-            }
-            ddct3d(n1, n2, n3, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    for (j3 = 0; j3 <= n3 - 1; j3++) {
-                        a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
-                    }
-                }
-            }
-        .
-
-
--------- DST (Discrete Sine Transform) / Inverse of DST --------
-    [definition]
-        <case1> IDST (excluding scale)
-            S[k1][k2][k3] = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
-                                A[j1][j2][j3] * 
-                                sin(pi*j1*(k1+1/2)/n1) * 
-                                sin(pi*j2*(k2+1/2)/n2) * 
-                                sin(pi*j3*(k3+1/2)/n3), 
-                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-        <case2> DST
-            S[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-                                a[j1][j2][j3] * 
-                                sin(pi*(j1+1/2)*k1/n1) * 
-                                sin(pi*(j2+1/2)*k2/n2) * 
-                                sin(pi*(j3+1/2)*k3/n3), 
-                                0<k1<=n1, 0<k2<=n2, 0<k3<=n3
-    [usage]
-        <case1>
-            ip[0] = 0; // first time only
-            ddst3d(n1, n2, n3, 1, a, t, ip, w);
-        <case2>
-            ip[0] = 0; // first time only
-            ddst3d(n1, n2, n3, -1, a, t, ip, w);
-    [parameters]
-        n1     :data length (int)
-                n1 >= 2, n1 = power of 2
-        n2     :data length (int)
-                n2 >= 2, n2 = power of 2
-        n3     :data length (int)
-                n3 >= 2, n3 = power of 2
-        a[0...n1-1][0...n2-1][0...n3-1]
-               :input/output data (double ***)
-                <case1>
-                    input data
-                        a[j1%n1][j2%n2][j3%n3] = A[j1][j2][j3], 
-                            0<j1<=n1, 0<j2<=n2, 0<j3<=n3, 
-                            (n%m : n mod m)
-                    output data
-                        a[k1][k2][k3] = S[k1][k2][k3], 
-                            0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-                <case2>
-                    output data
-                        a[k1%n1][k2%n2][k3%n3] = S[k1][k2][k3], 
-                            0<k1<=n1, 0<k2<=n2, 0<k3<=n3
-        t[0...*]
-               :work area (double *)
-                length of t >= max(4*n1, 4*n2),        if single thread, 
-                length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS, 
-                                                       if multi threads, 
-                t is dynamically allocated, if t == NULL.
-        ip[0...*]
-               :work area for bit reversal (int *)
-                length of ip >= 2+sqrt(n)
-                (n = max(n1/2, n2/2, n3/2))
-                ip[0],ip[1] are pointers of the cos/sin table.
-        w[0...*]
-               :cos/sin table (double *)
-                length of w >= max(n1*3/2, n2*3/2, n3*3/2)
-                w[],ip[] are initialized if ip[0] == 0.
-    [remark]
-        Inverse of 
-            ddst3d(n1, n2, n3, -1, a, t, ip, w);
-        is 
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    a[j1][j2][0] *= 0.5;
-                }
-                for (j3 = 0; j3 <= n3 - 1; j3++) {
-                    a[j1][0][j3] *= 0.5;
-                }
-            }
-            for (j2 = 0; j2 <= n2 - 1; j2++) {
-                for (j3 = 0; j3 <= n3 - 1; j3++) {
-                    a[0][j2][j3] *= 0.5;
-                }
-            }
-            ddst3d(n1, n2, n3, 1, a, t, ip, w);
-            for (j1 = 0; j1 <= n1 - 1; j1++) {
-                for (j2 = 0; j2 <= n2 - 1; j2++) {
-                    for (j3 = 0; j3 <= n3 - 1; j3++) {
-                        a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
-                    }
-                }
-            }
-        .
-*/
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#define fft3d_alloc_error_check(p) { \
-    if ((p) == NULL) { \
-        fprintf(stderr, "fft3d memory allocation error\n"); \
-        exit(1); \
-    } \
-}
-
-
-#ifdef USE_FFT3D_PTHREADS
-#define USE_FFT3D_THREADS
-#ifndef FFT3D_MAX_THREADS
-#define FFT3D_MAX_THREADS 4
-#endif
-#ifndef FFT3D_THREADS_BEGIN_N
-#define FFT3D_THREADS_BEGIN_N 65536
-#endif
-#include <pthread.h>
-#define fft3d_thread_t pthread_t
-#define fft3d_thread_create(thp,func,argp) { \
-    if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
-        fprintf(stderr, "fft3d thread error\n"); \
-        exit(1); \
-    } \
-}
-#define fft3d_thread_wait(th) { \
-    if (pthread_join(th, NULL) != 0) { \
-        fprintf(stderr, "fft3d thread error\n"); \
-        exit(1); \
-    } \
-}
-#endif /* USE_FFT3D_PTHREADS */
-
-
-#ifdef USE_FFT3D_WINTHREADS
-#define USE_FFT3D_THREADS
-#ifndef FFT3D_MAX_THREADS
-#define FFT3D_MAX_THREADS 4
-#endif
-#ifndef FFT3D_THREADS_BEGIN_N
-#define FFT3D_THREADS_BEGIN_N 131072
-#endif
-#include <windows.h>
-#define fft3d_thread_t HANDLE
-#define fft3d_thread_create(thp,func,argp) { \
-    DWORD thid; \
-    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
-    if (*(thp) == 0) { \
-        fprintf(stderr, "fft3d thread error\n"); \
-        exit(1); \
-    } \
-}
-#define fft3d_thread_wait(th) { \
-    WaitForSingleObject(th, INFINITE); \
-    CloseHandle(th); \
-}
-#endif /* USE_FFT3D_WINTHREADS */
-
-
-void cdft3d(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
-        double *t, int *ip, double *w);
-#ifdef USE_FFT3D_THREADS
-    void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
-        double *t, int *ip, double *w);
-#endif /* USE_FFT3D_THREADS */
-    int n, itnull, nt;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    n <<= 1;
-    if (n < n3) {
-        n = n3;
-    }
-    if (n > (ip[0] << 2)) {
-        makewt(n >> 2, ip, w);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nt = n1;
-        if (nt < n2) {
-            nt = n2;
-        }
-        nt *= 8;
-#ifdef USE_FFT3D_THREADS
-        nt *= FFT3D_MAX_THREADS;
-#endif /* USE_FFT3D_THREADS */
-        if (n3 == 4) {
-            nt >>= 1;
-        } else if (n3 < 4) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft3d_alloc_error_check(t);
-    }
-#ifdef USE_FFT3D_THREADS
-    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
-        xdft3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
-        cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT3D_THREADS */
-    {
-        xdft3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
-        cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void rdft3d(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
-        double *t, int *ip, double *w);
-    void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a);
-#ifdef USE_FFT3D_THREADS
-    void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
-        double *t, int *ip, double *w);
-#endif /* USE_FFT3D_THREADS */
-    int n, nw, nc, itnull, nt;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    n <<= 1;
-    if (n < n3) {
-        n = n3;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n3 > (nc << 2)) {
-        nc = n3 >> 2;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nt = n1;
-        if (nt < n2) {
-            nt = n2;
-        }
-        nt *= 8;
-#ifdef USE_FFT3D_THREADS
-        nt *= FFT3D_MAX_THREADS;
-#endif /* USE_FFT3D_THREADS */
-        if (n3 == 4) {
-            nt >>= 1;
-        } else if (n3 < 4) {
-            nt >>= 2;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft3d_alloc_error_check(t);
-    }
-#ifdef USE_FFT3D_THREADS
-    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
-        if (isgn < 0) {
-            rdft3d_sub(n1, n2, n3, isgn, a);
-            cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
-        }
-        xdft3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
-        if (isgn >= 0) {
-            cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
-            rdft3d_sub(n1, n2, n3, isgn, a);
-        }
-    } else 
-#endif /* USE_FFT3D_THREADS */
-    {
-        if (isgn < 0) {
-            rdft3d_sub(n1, n2, n3, isgn, a);
-            cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
-        }
-        xdft3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
-        if (isgn >= 0) {
-            cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
-            rdft3d_sub(n1, n2, n3, isgn, a);
-        }
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void rdft3dsort(int n1, int n2, int n3, int isgn, double ***a)
-{
-    int n1h, n2h, i, j;
-    double x, y;
-    
-    n1h = n1 >> 1;
-    n2h = n2 >> 1;
-    if (isgn < 0) {
-        for (i = 0; i < n1; i++) {
-            for (j = n2h + 1; j < n2; j++) {
-                a[i][j][0] = a[i][j][n3 + 1];
-                a[i][j][1] = a[i][j][n3];
-            }
-        }
-        for (i = n1h + 1; i < n1; i++) {
-            a[i][0][0] = a[i][0][n3 + 1];
-            a[i][0][1] = a[i][0][n3];
-            a[i][n2h][0] = a[i][n2h][n3 + 1];
-            a[i][n2h][1] = a[i][n2h][n3];
-        }
-        a[0][0][1] = a[0][0][n3];
-        a[0][n2h][1] = a[0][n2h][n3];
-        a[n1h][0][1] = a[n1h][0][n3];
-        a[n1h][n2h][1] = a[n1h][n2h][n3];
-    } else {
-        for (j = n2h + 1; j < n2; j++) {
-            y = a[0][j][0];
-            x = a[0][j][1];
-            a[0][j][n3] = x;
-            a[0][j][n3 + 1] = y;
-            a[0][n2 - j][n3] = x;
-            a[0][n2 - j][n3 + 1] = -y;
-            a[0][j][0] = a[0][n2 - j][0];
-            a[0][j][1] = -a[0][n2 - j][1];
-        }
-        for (i = 1; i < n1; i++) {
-            for (j = n2h + 1; j < n2; j++) {
-                y = a[i][j][0];
-                x = a[i][j][1];
-                a[i][j][n3] = x;
-                a[i][j][n3 + 1] = y;
-                a[n1 - i][n2 - j][n3] = x;
-                a[n1 - i][n2 - j][n3 + 1] = -y;
-                a[i][j][0] = a[n1 - i][n2 - j][0];
-                a[i][j][1] = -a[n1 - i][n2 - j][1];
-            }
-        }
-        for (i = n1h + 1; i < n1; i++) {
-            y = a[i][0][0];
-            x = a[i][0][1];
-            a[i][0][n3] = x;
-            a[i][0][n3 + 1] = y;
-            a[n1 - i][0][n3] = x;
-            a[n1 - i][0][n3 + 1] = -y;
-            a[i][0][0] = a[n1 - i][0][0];
-            a[i][0][1] = -a[n1 - i][0][1];
-            y = a[i][n2h][0];
-            x = a[i][n2h][1];
-            a[i][n2h][n3] = x;
-            a[i][n2h][n3 + 1] = y;
-            a[n1 - i][n2h][n3] = x;
-            a[n1 - i][n2h][n3 + 1] = -y;
-            a[i][n2h][0] = a[n1 - i][n2h][0];
-            a[i][n2h][1] = -a[n1 - i][n2h][1];
-        }
-        a[0][0][n3] = a[0][0][1];
-        a[0][0][n3 + 1] = 0;
-        a[0][0][1] = 0;
-        a[0][n2h][n3] = a[0][n2h][1];
-        a[0][n2h][n3 + 1] = 0;
-        a[0][n2h][1] = 0;
-        a[n1h][0][n3] = a[n1h][0][1];
-        a[n1h][0][n3 + 1] = 0;
-        a[n1h][0][1] = 0;
-        a[n1h][n2h][n3] = a[n1h][n2h][1];
-        a[n1h][n2h][n3 + 1] = 0;
-        a[n1h][n2h][1] = 0;
-    }
-}
-
-
-void ddct3d(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-#ifdef USE_FFT3D_THREADS
-    void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-#endif /* USE_FFT3D_THREADS */
-    int n, nw, nc, itnull, nt;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    if (n < n3) {
-        n = n3;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nt = n1;
-        if (nt < n2) {
-            nt = n2;
-        }
-        nt *= 4;
-#ifdef USE_FFT3D_THREADS
-        nt *= FFT3D_MAX_THREADS;
-#endif /* USE_FFT3D_THREADS */
-        if (n3 == 2) {
-            nt >>= 1;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft3d_alloc_error_check(t);
-    }
-#ifdef USE_FFT3D_THREADS
-    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
-        ddxt3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
-        ddxt3db_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT3D_THREADS */
-    {
-        ddxt3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
-        ddxt3db_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-void ddst3d(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void makewt(int nw, int *ip, double *w);
-    void makect(int nc, int *ip, double *c);
-    void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-#ifdef USE_FFT3D_THREADS
-    void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-    void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
-        double ***a, double *t, int *ip, double *w);
-#endif /* USE_FFT3D_THREADS */
-    int n, nw, nc, itnull, nt;
-    
-    n = n1;
-    if (n < n2) {
-        n = n2;
-    }
-    if (n < n3) {
-        n = n3;
-    }
-    nw = ip[0];
-    if (n > (nw << 2)) {
-        nw = n >> 2;
-        makewt(nw, ip, w);
-    }
-    nc = ip[1];
-    if (n > nc) {
-        nc = n;
-        makect(nc, ip, w + nw);
-    }
-    itnull = 0;
-    if (t == NULL) {
-        itnull = 1;
-        nt = n1;
-        if (nt < n2) {
-            nt = n2;
-        }
-        nt *= 4;
-#ifdef USE_FFT3D_THREADS
-        nt *= FFT3D_MAX_THREADS;
-#endif /* USE_FFT3D_THREADS */
-        if (n3 == 2) {
-            nt >>= 1;
-        }
-        t = (double *) malloc(sizeof(double) * nt);
-        fft3d_alloc_error_check(t);
-    }
-#ifdef USE_FFT3D_THREADS
-    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
-        ddxt3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
-        ddxt3db_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
-    } else 
-#endif /* USE_FFT3D_THREADS */
-    {
-        ddxt3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
-        ddxt3db_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
-    }
-    if (itnull != 0) {
-        free(t);
-    }
-}
-
-
-/* -------- child routines -------- */
-
-
-void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    void rdft(int n, int isgn, double *a, int *ip, double *w);
-    int i, j, k;
-    
-    for (i = 0; i < n1; i++) {
-        if (icr == 0) {
-            for (j = 0; j < n2; j++) {
-                cdft(n3, isgn, a[i][j], ip, w);
-            }
-        } else if (isgn >= 0) {
-            for (j = 0; j < n2; j++) {
-                rdft(n3, isgn, a[i][j], ip, w);
-            }
-        }
-        if (n3 > 4) {
-            for (k = 0; k < n3; k += 8) {
-                for (j = 0; j < n2; j++) {
-                    t[2 * j] = a[i][j][k];
-                    t[2 * j + 1] = a[i][j][k + 1];
-                    t[2 * n2 + 2 * j] = a[i][j][k + 2];
-                    t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
-                    t[4 * n2 + 2 * j] = a[i][j][k + 4];
-                    t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
-                    t[6 * n2 + 2 * j] = a[i][j][k + 6];
-                    t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
-                }
-                cdft(2 * n2, isgn, t, ip, w);
-                cdft(2 * n2, isgn, &t[2 * n2], ip, w);
-                cdft(2 * n2, isgn, &t[4 * n2], ip, w);
-                cdft(2 * n2, isgn, &t[6 * n2], ip, w);
-                for (j = 0; j < n2; j++) {
-                    a[i][j][k] = t[2 * j];
-                    a[i][j][k + 1] = t[2 * j + 1];
-                    a[i][j][k + 2] = t[2 * n2 + 2 * j];
-                    a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
-                    a[i][j][k + 4] = t[4 * n2 + 2 * j];
-                    a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
-                    a[i][j][k + 6] = t[6 * n2 + 2 * j];
-                    a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
-                }
-            }
-        } else if (n3 == 4) {
-            for (j = 0; j < n2; j++) {
-                t[2 * j] = a[i][j][0];
-                t[2 * j + 1] = a[i][j][1];
-                t[2 * n2 + 2 * j] = a[i][j][2];
-                t[2 * n2 + 2 * j + 1] = a[i][j][3];
-            }
-            cdft(2 * n2, isgn, t, ip, w);
-            cdft(2 * n2, isgn, &t[2 * n2], ip, w);
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[2 * j];
-                a[i][j][1] = t[2 * j + 1];
-                a[i][j][2] = t[2 * n2 + 2 * j];
-                a[i][j][3] = t[2 * n2 + 2 * j + 1];
-            }
-        } else if (n3 == 2) {
-            for (j = 0; j < n2; j++) {
-                t[2 * j] = a[i][j][0];
-                t[2 * j + 1] = a[i][j][1];
-            }
-            cdft(2 * n2, isgn, t, ip, w);
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[2 * j];
-                a[i][j][1] = t[2 * j + 1];
-            }
-        }
-        if (icr != 0 && isgn < 0) {
-            for (j = 0; j < n2; j++) {
-                rdft(n3, isgn, a[i][j], ip, w);
-            }
-        }
-    }
-}
-
-
-void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    int i, j, k;
-    
-    if (n3 > 4) {
-        for (j = 0; j < n2; j++) {
-            for (k = 0; k < n3; k += 8) {
-                for (i = 0; i < n1; i++) {
-                    t[2 * i] = a[i][j][k];
-                    t[2 * i + 1] = a[i][j][k + 1];
-                    t[2 * n1 + 2 * i] = a[i][j][k + 2];
-                    t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
-                    t[4 * n1 + 2 * i] = a[i][j][k + 4];
-                    t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
-                    t[6 * n1 + 2 * i] = a[i][j][k + 6];
-                    t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
-                }
-                cdft(2 * n1, isgn, t, ip, w);
-                cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-                cdft(2 * n1, isgn, &t[4 * n1], ip, w);
-                cdft(2 * n1, isgn, &t[6 * n1], ip, w);
-                for (i = 0; i < n1; i++) {
-                    a[i][j][k] = t[2 * i];
-                    a[i][j][k + 1] = t[2 * i + 1];
-                    a[i][j][k + 2] = t[2 * n1 + 2 * i];
-                    a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
-                    a[i][j][k + 4] = t[4 * n1 + 2 * i];
-                    a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
-                    a[i][j][k + 6] = t[6 * n1 + 2 * i];
-                    a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
-                }
-            }
-        }
-    } else if (n3 == 4) {
-        for (j = 0; j < n2; j++) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j][0];
-                t[2 * i + 1] = a[i][j][1];
-                t[2 * n1 + 2 * i] = a[i][j][2];
-                t[2 * n1 + 2 * i + 1] = a[i][j][3];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[2 * i];
-                a[i][j][1] = t[2 * i + 1];
-                a[i][j][2] = t[2 * n1 + 2 * i];
-                a[i][j][3] = t[2 * n1 + 2 * i + 1];
-            }
-        }
-    } else if (n3 == 2) {
-        for (j = 0; j < n2; j++) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j][0];
-                t[2 * i + 1] = a[i][j][1];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[2 * i];
-                a[i][j][1] = t[2 * i + 1];
-            }
-        }
-    }
-}
-
-
-void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a)
-{
-    int n1h, n2h, i, j, k, l;
-    double xi;
-    
-    n1h = n1 >> 1;
-    n2h = n2 >> 1;
-    if (isgn < 0) {
-        for (i = 1; i < n1h; i++) {
-            j = n1 - i;
-            xi = a[i][0][0] - a[j][0][0];
-            a[i][0][0] += a[j][0][0];
-            a[j][0][0] = xi;
-            xi = a[j][0][1] - a[i][0][1];
-            a[i][0][1] += a[j][0][1];
-            a[j][0][1] = xi;
-            xi = a[i][n2h][0] - a[j][n2h][0];
-            a[i][n2h][0] += a[j][n2h][0];
-            a[j][n2h][0] = xi;
-            xi = a[j][n2h][1] - a[i][n2h][1];
-            a[i][n2h][1] += a[j][n2h][1];
-            a[j][n2h][1] = xi;
-            for (k = 1; k < n2h; k++) {
-                l = n2 - k;
-                xi = a[i][k][0] - a[j][l][0];
-                a[i][k][0] += a[j][l][0];
-                a[j][l][0] = xi;
-                xi = a[j][l][1] - a[i][k][1];
-                a[i][k][1] += a[j][l][1];
-                a[j][l][1] = xi;
-                xi = a[j][k][0] - a[i][l][0];
-                a[j][k][0] += a[i][l][0];
-                a[i][l][0] = xi;
-                xi = a[i][l][1] - a[j][k][1];
-                a[j][k][1] += a[i][l][1];
-                a[i][l][1] = xi;
-            }
-        }
-        for (k = 1; k < n2h; k++) {
-            l = n2 - k;
-            xi = a[0][k][0] - a[0][l][0];
-            a[0][k][0] += a[0][l][0];
-            a[0][l][0] = xi;
-            xi = a[0][l][1] - a[0][k][1];
-            a[0][k][1] += a[0][l][1];
-            a[0][l][1] = xi;
-            xi = a[n1h][k][0] - a[n1h][l][0];
-            a[n1h][k][0] += a[n1h][l][0];
-            a[n1h][l][0] = xi;
-            xi = a[n1h][l][1] - a[n1h][k][1];
-            a[n1h][k][1] += a[n1h][l][1];
-            a[n1h][l][1] = xi;
-        }
-    } else {
-        for (i = 1; i < n1h; i++) {
-            j = n1 - i;
-            a[j][0][0] = 0.5 * (a[i][0][0] - a[j][0][0]);
-            a[i][0][0] -= a[j][0][0];
-            a[j][0][1] = 0.5 * (a[i][0][1] + a[j][0][1]);
-            a[i][0][1] -= a[j][0][1];
-            a[j][n2h][0] = 0.5 * (a[i][n2h][0] - a[j][n2h][0]);
-            a[i][n2h][0] -= a[j][n2h][0];
-            a[j][n2h][1] = 0.5 * (a[i][n2h][1] + a[j][n2h][1]);
-            a[i][n2h][1] -= a[j][n2h][1];
-            for (k = 1; k < n2h; k++) {
-                l = n2 - k;
-                a[j][l][0] = 0.5 * (a[i][k][0] - a[j][l][0]);
-                a[i][k][0] -= a[j][l][0];
-                a[j][l][1] = 0.5 * (a[i][k][1] + a[j][l][1]);
-                a[i][k][1] -= a[j][l][1];
-                a[i][l][0] = 0.5 * (a[j][k][0] - a[i][l][0]);
-                a[j][k][0] -= a[i][l][0];
-                a[i][l][1] = 0.5 * (a[j][k][1] + a[i][l][1]);
-                a[j][k][1] -= a[i][l][1];
-            }
-        }
-        for (k = 1; k < n2h; k++) {
-            l = n2 - k;
-            a[0][l][0] = 0.5 * (a[0][k][0] - a[0][l][0]);
-            a[0][k][0] -= a[0][l][0];
-            a[0][l][1] = 0.5 * (a[0][k][1] + a[0][l][1]);
-            a[0][k][1] -= a[0][l][1];
-            a[n1h][l][0] = 0.5 * (a[n1h][k][0] - a[n1h][l][0]);
-            a[n1h][k][0] -= a[n1h][l][0];
-            a[n1h][l][1] = 0.5 * (a[n1h][k][1] + a[n1h][l][1]);
-            a[n1h][k][1] -= a[n1h][l][1];
-        }
-    }
-}
-
-
-void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int i, j, k;
-    
-    for (i = 0; i < n1; i++) {
-        if (ics == 0) {
-            for (j = 0; j < n2; j++) {
-                ddct(n3, isgn, a[i][j], ip, w);
-            }
-        } else {
-            for (j = 0; j < n2; j++) {
-                ddst(n3, isgn, a[i][j], ip, w);
-            }
-        }
-        if (n3 > 2) {
-            for (k = 0; k < n3; k += 4) {
-                for (j = 0; j < n2; j++) {
-                    t[j] = a[i][j][k];
-                    t[n2 + j] = a[i][j][k + 1];
-                    t[2 * n2 + j] = a[i][j][k + 2];
-                    t[3 * n2 + j] = a[i][j][k + 3];
-                }
-                if (ics == 0) {
-                    ddct(n2, isgn, t, ip, w);
-                    ddct(n2, isgn, &t[n2], ip, w);
-                    ddct(n2, isgn, &t[2 * n2], ip, w);
-                    ddct(n2, isgn, &t[3 * n2], ip, w);
-                } else {
-                    ddst(n2, isgn, t, ip, w);
-                    ddst(n2, isgn, &t[n2], ip, w);
-                    ddst(n2, isgn, &t[2 * n2], ip, w);
-                    ddst(n2, isgn, &t[3 * n2], ip, w);
-                }
-                for (j = 0; j < n2; j++) {
-                    a[i][j][k] = t[j];
-                    a[i][j][k + 1] = t[n2 + j];
-                    a[i][j][k + 2] = t[2 * n2 + j];
-                    a[i][j][k + 3] = t[3 * n2 + j];
-                }
-            }
-        } else if (n3 == 2) {
-            for (j = 0; j < n2; j++) {
-                t[j] = a[i][j][0];
-                t[n2 + j] = a[i][j][1];
-            }
-            if (ics == 0) {
-                ddct(n2, isgn, t, ip, w);
-                ddct(n2, isgn, &t[n2], ip, w);
-            } else {
-                ddst(n2, isgn, t, ip, w);
-                ddst(n2, isgn, &t[n2], ip, w);
-            }
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[j];
-                a[i][j][1] = t[n2 + j];
-            }
-        }
-    }
-}
-
-
-void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int i, j, k;
-    
-    if (n3 > 2) {
-        for (j = 0; j < n2; j++) {
-            for (k = 0; k < n3; k += 4) {
-                for (i = 0; i < n1; i++) {
-                    t[i] = a[i][j][k];
-                    t[n1 + i] = a[i][j][k + 1];
-                    t[2 * n1 + i] = a[i][j][k + 2];
-                    t[3 * n1 + i] = a[i][j][k + 3];
-                }
-                if (ics == 0) {
-                    ddct(n1, isgn, t, ip, w);
-                    ddct(n1, isgn, &t[n1], ip, w);
-                    ddct(n1, isgn, &t[2 * n1], ip, w);
-                    ddct(n1, isgn, &t[3 * n1], ip, w);
-                } else {
-                    ddst(n1, isgn, t, ip, w);
-                    ddst(n1, isgn, &t[n1], ip, w);
-                    ddst(n1, isgn, &t[2 * n1], ip, w);
-                    ddst(n1, isgn, &t[3 * n1], ip, w);
-                }
-                for (i = 0; i < n1; i++) {
-                    a[i][j][k] = t[i];
-                    a[i][j][k + 1] = t[n1 + i];
-                    a[i][j][k + 2] = t[2 * n1 + i];
-                    a[i][j][k + 3] = t[3 * n1 + i];
-                }
-            }
-        }
-    } else if (n3 == 2) {
-        for (j = 0; j < n2; j++) {
-            for (i = 0; i < n1; i++) {
-                t[i] = a[i][j][0];
-                t[n1 + i] = a[i][j][1];
-            }
-            if (ics == 0) {
-                ddct(n1, isgn, t, ip, w);
-                ddct(n1, isgn, &t[n1], ip, w);
-            } else {
-                ddst(n1, isgn, t, ip, w);
-                ddst(n1, isgn, &t[n1], ip, w);
-            }
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[i];
-                a[i][j][1] = t[n1 + i];
-            }
-        }
-    }
-}
-
-
-#ifdef USE_FFT3D_THREADS
-struct fft3d_arg_st {
-    int nthread;
-    int n0;
-    int n1;
-    int n2;
-    int n3;
-    int ic;
-    int isgn;
-    double ***a;
-    double *t;
-    int *ip;
-    double *w;
-};
-typedef struct fft3d_arg_st fft3d_arg_t;
-
-
-void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void *xdft3da_th(void *p);
-    fft3d_thread_t th[FFT3D_MAX_THREADS];
-    fft3d_arg_t ag[FFT3D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT3D_MAX_THREADS;
-    if (nthread > n1) {
-        nthread = n1;
-    }
-    nt = 8 * n2;
-    if (n3 == 4) {
-        nt >>= 1;
-    } else if (n3 < 4) {
-        nt >>= 2;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].n3 = n3;
-        ag[i].ic = icr;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft3d_thread_create(&th[i], xdft3da_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft3d_thread_wait(th[i]);
-    }
-}
-
-
-void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
-    double *t, int *ip, double *w)
-{
-    void *cdft3db_th(void *p);
-    fft3d_thread_t th[FFT3D_MAX_THREADS];
-    fft3d_arg_t ag[FFT3D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT3D_MAX_THREADS;
-    if (nthread > n2) {
-        nthread = n2;
-    }
-    nt = 8 * n1;
-    if (n3 == 4) {
-        nt >>= 1;
-    } else if (n3 < 4) {
-        nt >>= 2;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].n3 = n3;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft3d_thread_create(&th[i], cdft3db_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft3d_thread_wait(th[i]);
-    }
-}
-
-
-void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void *ddxt3da_th(void *p);
-    fft3d_thread_t th[FFT3D_MAX_THREADS];
-    fft3d_arg_t ag[FFT3D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT3D_MAX_THREADS;
-    if (nthread > n1) {
-        nthread = n1;
-    }
-    nt = 4 * n2;
-    if (n3 == 2) {
-        nt >>= 1;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].n3 = n3;
-        ag[i].ic = ics;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft3d_thread_create(&th[i], ddxt3da_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft3d_thread_wait(th[i]);
-    }
-}
-
-
-void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
-    double ***a, double *t, int *ip, double *w)
-{
-    void *ddxt3db_th(void *p);
-    fft3d_thread_t th[FFT3D_MAX_THREADS];
-    fft3d_arg_t ag[FFT3D_MAX_THREADS];
-    int nthread, nt, i;
-    
-    nthread = FFT3D_MAX_THREADS;
-    if (nthread > n2) {
-        nthread = n2;
-    }
-    nt = 4 * n1;
-    if (n3 == 2) {
-        nt >>= 1;
-    }
-    for (i = 0; i < nthread; i++) {
-        ag[i].nthread = nthread;
-        ag[i].n0 = i;
-        ag[i].n1 = n1;
-        ag[i].n2 = n2;
-        ag[i].n3 = n3;
-        ag[i].ic = ics;
-        ag[i].isgn = isgn;
-        ag[i].a = a;
-        ag[i].t = &t[nt * i];
-        ag[i].ip = ip;
-        ag[i].w = w;
-        fft3d_thread_create(&th[i], ddxt3db_th, &ag[i]);
-    }
-    for (i = 0; i < nthread; i++) {
-        fft3d_thread_wait(th[i]);
-    }
-}
-
-
-void *xdft3da_th(void *p)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    void rdft(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, n3, icr, isgn, *ip, i, j, k;
-    double ***a, *t, *w;
-    
-    nthread = ((fft3d_arg_t *) p)->nthread;
-    n0 = ((fft3d_arg_t *) p)->n0;
-    n1 = ((fft3d_arg_t *) p)->n1;
-    n2 = ((fft3d_arg_t *) p)->n2;
-    n3 = ((fft3d_arg_t *) p)->n3;
-    icr = ((fft3d_arg_t *) p)->ic;
-    isgn = ((fft3d_arg_t *) p)->isgn;
-    a = ((fft3d_arg_t *) p)->a;
-    t = ((fft3d_arg_t *) p)->t;
-    ip = ((fft3d_arg_t *) p)->ip;
-    w = ((fft3d_arg_t *) p)->w;
-    for (i = n0; i < n1; i += nthread) {
-        if (icr == 0) {
-            for (j = 0; j < n2; j++) {
-                cdft(n3, isgn, a[i][j], ip, w);
-            }
-        } else if (isgn >= 0) {
-            for (j = 0; j < n2; j++) {
-                rdft(n3, isgn, a[i][j], ip, w);
-            }
-        }
-        if (n3 > 4) {
-            for (k = 0; k < n3; k += 8) {
-                for (j = 0; j < n2; j++) {
-                    t[2 * j] = a[i][j][k];
-                    t[2 * j + 1] = a[i][j][k + 1];
-                    t[2 * n2 + 2 * j] = a[i][j][k + 2];
-                    t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
-                    t[4 * n2 + 2 * j] = a[i][j][k + 4];
-                    t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
-                    t[6 * n2 + 2 * j] = a[i][j][k + 6];
-                    t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
-                }
-                cdft(2 * n2, isgn, t, ip, w);
-                cdft(2 * n2, isgn, &t[2 * n2], ip, w);
-                cdft(2 * n2, isgn, &t[4 * n2], ip, w);
-                cdft(2 * n2, isgn, &t[6 * n2], ip, w);
-                for (j = 0; j < n2; j++) {
-                    a[i][j][k] = t[2 * j];
-                    a[i][j][k + 1] = t[2 * j + 1];
-                    a[i][j][k + 2] = t[2 * n2 + 2 * j];
-                    a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
-                    a[i][j][k + 4] = t[4 * n2 + 2 * j];
-                    a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
-                    a[i][j][k + 6] = t[6 * n2 + 2 * j];
-                    a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
-                }
-            }
-        } else if (n3 == 4) {
-            for (j = 0; j < n2; j++) {
-                t[2 * j] = a[i][j][0];
-                t[2 * j + 1] = a[i][j][1];
-                t[2 * n2 + 2 * j] = a[i][j][2];
-                t[2 * n2 + 2 * j + 1] = a[i][j][3];
-            }
-            cdft(2 * n2, isgn, t, ip, w);
-            cdft(2 * n2, isgn, &t[2 * n2], ip, w);
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[2 * j];
-                a[i][j][1] = t[2 * j + 1];
-                a[i][j][2] = t[2 * n2 + 2 * j];
-                a[i][j][3] = t[2 * n2 + 2 * j + 1];
-            }
-        } else if (n3 == 2) {
-            for (j = 0; j < n2; j++) {
-                t[2 * j] = a[i][j][0];
-                t[2 * j + 1] = a[i][j][1];
-            }
-            cdft(2 * n2, isgn, t, ip, w);
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[2 * j];
-                a[i][j][1] = t[2 * j + 1];
-            }
-        }
-        if (icr != 0 && isgn < 0) {
-            for (j = 0; j < n2; j++) {
-                rdft(n3, isgn, a[i][j], ip, w);
-            }
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *cdft3db_th(void *p)
-{
-    void cdft(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, n3, isgn, *ip, i, j, k;
-    double ***a, *t, *w;
-    
-    nthread = ((fft3d_arg_t *) p)->nthread;
-    n0 = ((fft3d_arg_t *) p)->n0;
-    n1 = ((fft3d_arg_t *) p)->n1;
-    n2 = ((fft3d_arg_t *) p)->n2;
-    n3 = ((fft3d_arg_t *) p)->n3;
-    isgn = ((fft3d_arg_t *) p)->isgn;
-    a = ((fft3d_arg_t *) p)->a;
-    t = ((fft3d_arg_t *) p)->t;
-    ip = ((fft3d_arg_t *) p)->ip;
-    w = ((fft3d_arg_t *) p)->w;
-    if (n3 > 4) {
-        for (j = n0; j < n2; j += nthread) {
-            for (k = 0; k < n3; k += 8) {
-                for (i = 0; i < n1; i++) {
-                    t[2 * i] = a[i][j][k];
-                    t[2 * i + 1] = a[i][j][k + 1];
-                    t[2 * n1 + 2 * i] = a[i][j][k + 2];
-                    t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
-                    t[4 * n1 + 2 * i] = a[i][j][k + 4];
-                    t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
-                    t[6 * n1 + 2 * i] = a[i][j][k + 6];
-                    t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
-                }
-                cdft(2 * n1, isgn, t, ip, w);
-                cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-                cdft(2 * n1, isgn, &t[4 * n1], ip, w);
-                cdft(2 * n1, isgn, &t[6 * n1], ip, w);
-                for (i = 0; i < n1; i++) {
-                    a[i][j][k] = t[2 * i];
-                    a[i][j][k + 1] = t[2 * i + 1];
-                    a[i][j][k + 2] = t[2 * n1 + 2 * i];
-                    a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
-                    a[i][j][k + 4] = t[4 * n1 + 2 * i];
-                    a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
-                    a[i][j][k + 6] = t[6 * n1 + 2 * i];
-                    a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
-                }
-            }
-        }
-    } else if (n3 == 4) {
-        for (j = n0; j < n2; j += nthread) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j][0];
-                t[2 * i + 1] = a[i][j][1];
-                t[2 * n1 + 2 * i] = a[i][j][2];
-                t[2 * n1 + 2 * i + 1] = a[i][j][3];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[2 * i];
-                a[i][j][1] = t[2 * i + 1];
-                a[i][j][2] = t[2 * n1 + 2 * i];
-                a[i][j][3] = t[2 * n1 + 2 * i + 1];
-            }
-        }
-    } else if (n3 == 2) {
-        for (j = n0; j < n2; j += nthread) {
-            for (i = 0; i < n1; i++) {
-                t[2 * i] = a[i][j][0];
-                t[2 * i + 1] = a[i][j][1];
-            }
-            cdft(2 * n1, isgn, t, ip, w);
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[2 * i];
-                a[i][j][1] = t[2 * i + 1];
-            }
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *ddxt3da_th(void *p)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
-    double ***a, *t, *w;
-    
-    nthread = ((fft3d_arg_t *) p)->nthread;
-    n0 = ((fft3d_arg_t *) p)->n0;
-    n1 = ((fft3d_arg_t *) p)->n1;
-    n2 = ((fft3d_arg_t *) p)->n2;
-    n3 = ((fft3d_arg_t *) p)->n3;
-    ics = ((fft3d_arg_t *) p)->ic;
-    isgn = ((fft3d_arg_t *) p)->isgn;
-    a = ((fft3d_arg_t *) p)->a;
-    t = ((fft3d_arg_t *) p)->t;
-    ip = ((fft3d_arg_t *) p)->ip;
-    w = ((fft3d_arg_t *) p)->w;
-    for (i = n0; i < n1; i += nthread) {
-        if (ics == 0) {
-            for (j = 0; j < n2; j++) {
-                ddct(n3, isgn, a[i][j], ip, w);
-            }
-        } else {
-            for (j = 0; j < n2; j++) {
-                ddst(n3, isgn, a[i][j], ip, w);
-            }
-        }
-        if (n3 > 2) {
-            for (k = 0; k < n3; k += 4) {
-                for (j = 0; j < n2; j++) {
-                    t[j] = a[i][j][k];
-                    t[n2 + j] = a[i][j][k + 1];
-                    t[2 * n2 + j] = a[i][j][k + 2];
-                    t[3 * n2 + j] = a[i][j][k + 3];
-                }
-                if (ics == 0) {
-                    ddct(n2, isgn, t, ip, w);
-                    ddct(n2, isgn, &t[n2], ip, w);
-                    ddct(n2, isgn, &t[2 * n2], ip, w);
-                    ddct(n2, isgn, &t[3 * n2], ip, w);
-                } else {
-                    ddst(n2, isgn, t, ip, w);
-                    ddst(n2, isgn, &t[n2], ip, w);
-                    ddst(n2, isgn, &t[2 * n2], ip, w);
-                    ddst(n2, isgn, &t[3 * n2], ip, w);
-                }
-                for (j = 0; j < n2; j++) {
-                    a[i][j][k] = t[j];
-                    a[i][j][k + 1] = t[n2 + j];
-                    a[i][j][k + 2] = t[2 * n2 + j];
-                    a[i][j][k + 3] = t[3 * n2 + j];
-                }
-            }
-        } else if (n3 == 2) {
-            for (j = 0; j < n2; j++) {
-                t[j] = a[i][j][0];
-                t[n2 + j] = a[i][j][1];
-            }
-            if (ics == 0) {
-                ddct(n2, isgn, t, ip, w);
-                ddct(n2, isgn, &t[n2], ip, w);
-            } else {
-                ddst(n2, isgn, t, ip, w);
-                ddst(n2, isgn, &t[n2], ip, w);
-            }
-            for (j = 0; j < n2; j++) {
-                a[i][j][0] = t[j];
-                a[i][j][1] = t[n2 + j];
-            }
-        }
-    }
-    return (void *) 0;
-}
-
-
-void *ddxt3db_th(void *p)
-{
-    void ddct(int n, int isgn, double *a, int *ip, double *w);
-    void ddst(int n, int isgn, double *a, int *ip, double *w);
-    int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
-    double ***a, *t, *w;
-    
-    nthread = ((fft3d_arg_t *) p)->nthread;
-    n0 = ((fft3d_arg_t *) p)->n0;
-    n1 = ((fft3d_arg_t *) p)->n1;
-    n2 = ((fft3d_arg_t *) p)->n2;
-    n3 = ((fft3d_arg_t *) p)->n3;
-    ics = ((fft3d_arg_t *) p)->ic;
-    isgn = ((fft3d_arg_t *) p)->isgn;
-    a = ((fft3d_arg_t *) p)->a;
-    t = ((fft3d_arg_t *) p)->t;
-    ip = ((fft3d_arg_t *) p)->ip;
-    w = ((fft3d_arg_t *) p)->w;
-    if (n3 > 2) {
-        for (j = n0; j < n2; j += nthread) {
-            for (k = 0; k < n3; k += 4) {
-                for (i = 0; i < n1; i++) {
-                    t[i] = a[i][j][k];
-                    t[n1 + i] = a[i][j][k + 1];
-                    t[2 * n1 + i] = a[i][j][k + 2];
-                    t[3 * n1 + i] = a[i][j][k + 3];
-                }
-                if (ics == 0) {
-                    ddct(n1, isgn, t, ip, w);
-                    ddct(n1, isgn, &t[n1], ip, w);
-                    ddct(n1, isgn, &t[2 * n1], ip, w);
-                    ddct(n1, isgn, &t[3 * n1], ip, w);
-                } else {
-                    ddst(n1, isgn, t, ip, w);
-                    ddst(n1, isgn, &t[n1], ip, w);
-                    ddst(n1, isgn, &t[2 * n1], ip, w);
-                    ddst(n1, isgn, &t[3 * n1], ip, w);
-                }
-                for (i = 0; i < n1; i++) {
-                    a[i][j][k] = t[i];
-                    a[i][j][k + 1] = t[n1 + i];
-                    a[i][j][k + 2] = t[2 * n1 + i];
-                    a[i][j][k + 3] = t[3 * n1 + i];
-                }
-            }
-        }
-    } else if (n3 == 2) {
-        for (j = n0; j < n2; j += nthread) {
-            for (i = 0; i < n1; i++) {
-                t[i] = a[i][j][0];
-                t[n1 + i] = a[i][j][1];
-            }
-            if (ics == 0) {
-                ddct(n1, isgn, t, ip, w);
-                ddct(n1, isgn, &t[n1], ip, w);
-            } else {
-                ddst(n1, isgn, t, ip, w);
-                ddst(n1, isgn, &t[n1], ip, w);
-            }
-            for (i = 0; i < n1; i++) {
-                a[i][j][0] = t[i];
-                a[i][j][1] = t[n1 + i];
-            }
-        }
-    }
-    return (void *) 0;
-}
-#endif /* USE_FFT3D_THREADS */
-
diff --git a/third_party/tensorflow_dependencies/fft2d/fftsg3d.f b/third_party/tensorflow_dependencies/fft2d/fftsg3d.f
deleted file mode 100644
index f08b1dc..0000000
--- a/third_party/tensorflow_dependencies/fft2d/fftsg3d.f
+++ /dev/null
@@ -1,926 +0,0 @@
-! Fast Fourier/Cosine/Sine Transform
-!     dimension   :three
-!     data length :power of 2
-!     decimation  :frequency
-!     radix       :split-radix, row-column
-!     data        :inplace
-!     table       :use
-! subroutines
-!     cdft3d: Complex Discrete Fourier Transform
-!     rdft3d: Real Discrete Fourier Transform
-!     ddct3d: Discrete Cosine Transform
-!     ddst3d: Discrete Sine Transform
-! necessary package
-!     fftsg.f  : 1D-FFT package
-!
-!
-! -------- Complex DFT (Discrete Fourier Transform) --------
-!     [definition]
-!         <case1>
-!             X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               x(j1,j2,j3) * 
-!                               exp(2*pi*i*j1*k1/n1) * 
-!                               exp(2*pi*i*j2*k2/n2) * 
-!                               exp(2*pi*i*j3*k3/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         <case2>
-!             X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               x(j1,j2,j3) * 
-!                               exp(-2*pi*i*j1*k1/n1) * 
-!                               exp(-2*pi*i*j2*k2/n2) * 
-!                               exp(-2*pi*i*j3*k3/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row1 size of the 3D array (integer)
-!         n2max  :row2 size of the 3D array (integer)
-!         2*n1   :data length (integer)
-!                 n1 >= 1, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 1, n2 = power of 2
-!         n3     :data length (integer)
-!                 n3 >= 1, n3 = power of 2
-!         a(0:2*n1-1,0:n2-1,0:n3-1)
-!                :input/output data (real*8)
-!                 input data
-!                     a(2*j1,j2,j3) = Re(x(j1,j2,j3)), 
-!                     a(2*j1+1,j2,j3) = Im(x(j1,j2,j3)), 
-!                     0<=j1<n1, 0<=j2<n2, 0<=j3<n3
-!                 output data
-!                     a(2*k1,k2,k3) = Re(X(k1,k2,k3)), 
-!                     a(2*k1+1,k2,k3) = Im(X(k1,k2,k3)), 
-!                     0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         t(0:*) :work area (real*8)
-!                 length of t >= max(8*n2, 8*n3)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1, n2, n3))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/2, n2/2, n3/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
-!         is 
-!             call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     do j1 = 0, 2 * n1 - 1
-!                         a(j1,j2,j3) = a(j1,j2,j3) * (1.0d0/n1/n2/n3)
-!                     end do
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- Real DFT / Inverse of Real DFT --------
-!     [definition]
-!         <case1> RDFT
-!             R(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               a(j1,j2,j3) * 
-!                               cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-!                                   2*pi*j3*k3/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!             I(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               a(j1,j2,j3) * 
-!                               sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-!                                   2*pi*j3*k3/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         <case2> IRDFT (excluding scale)
-!             a(k1,k2,k3) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               (R(j1,j2,j3) * 
-!                               cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-!                                   2*pi*j3*k3/n3) + 
-!                               I(j1,j2,j3) * 
-!                               sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
-!                                   2*pi*j3*k3/n3)), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         (notes: R(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = R(k1,k2,k3), 
-!                 I(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = -I(k1,k2,k3), 
-!                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row1 size of the 3D array (integer)
-!         n2max  :row2 size of the 3D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         n3     :data length (integer)
-!                 n3 >= 2, n3 = power of 2
-!         a(0:n1-1,0:n2-1,0:n3-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     output data
-!                         a(2*k1,k2,k3) = R(k1,k2,k3)
-!                                 = R(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)), 
-!                         a(2*k1+1,k2,k3) = I(k1,k2,k3)
-!                                 = -I(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)), 
-!                             0<k1<n1/2, 0<=k2<n2, 0<=k3<n3, 
-!                         a(0,k2,k3) = R(0,k2,k3)
-!                                    = R(0,n2-k2,mod(n3-k3,n3)), 
-!                         a(1,k2,k3) = I(0,k2,k3)
-!                                    = -I(0,n2-k2,mod(n3-k3,n3)), 
-!                         a(1,n2-k2,k3) = R(n1/2,k2,mod(n3-k3,n3))
-!                                       = R(n1/2,n2-k2,k3), 
-!                         a(0,n2-k2,k3) = -I(n1/2,k2,mod(n3-k3,n3))
-!                                       = I(n1/2,n2-k2,k3), 
-!                             0<k2<n2/2, 0<=k3<n3, 
-!                         a(0,0,k3) = R(0,0,k3)
-!                                   = R(0,0,n3-k3), 
-!                         a(1,0,k3) = I(0,0,k3)
-!                                   = -I(0,0,n3-k3), 
-!                         a(0,n2/2,k3) = R(0,n2/2,k3)
-!                                      = R(0,n2/2,n3-k3), 
-!                         a(1,n2/2,k3) = I(0,n2/2,k3)
-!                                      = -I(0,n2/2,n3-k3), 
-!                         a(1,0,n3-k3) = R(n1/2,0,k3)
-!                                      = R(n1/2,0,n3-k3), 
-!                         a(0,0,n3-k3) = -I(n1/2,0,k3)
-!                                      = I(n1/2,0,n3-k3), 
-!                         a(1,n2/2,n3-k3) = R(n1/2,n2/2,k3)
-!                                         = R(n1/2,n2/2,n3-k3), 
-!                         a(0,n2/2,n3-k3) = -I(n1/2,n2/2,k3)
-!                                         = I(n1/2,n2/2,n3-k3), 
-!                             0<k3<n3/2, 
-!                         a(0,0,0) = R(0,0,0), 
-!                         a(1,0,0) = R(n1/2,0,0), 
-!                         a(0,0,n3/2) = R(0,0,n3/2), 
-!                         a(1,0,n3/2) = R(n1/2,0,n3/2), 
-!                         a(0,n2/2,0) = R(0,n2/2,0), 
-!                         a(1,n2/2,0) = R(n1/2,n2/2,0), 
-!                         a(0,n2/2,n3/2) = R(0,n2/2,n3/2), 
-!                         a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
-!                 <case2>
-!                     input data
-!                         a(2*j1,j2,j3) = R(j1,j2,j3)
-!                                 = R(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)), 
-!                         a(2*j1+1,j2,j3) = I(j1,j2,j3)
-!                                 = -I(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)), 
-!                             0<j1<n1/2, 0<=j2<n2, 0<=j3<n3, 
-!                         a(0,j2,j3) = R(0,j2,j3)
-!                                    = R(0,n2-j2,mod(n3-j3,n3)), 
-!                         a(1,j2,j3) = I(0,j2,j3)
-!                                    = -I(0,n2-j2,mod(n3-j3,n3)), 
-!                         a(1,n2-j2,j3) = R(n1/2,j2,mod(n3-j3,n3))
-!                                       = R(n1/2,n2-j2,j3), 
-!                         a(0,n2-j2,j3) = -I(n1/2,j2,mod(n3-j3,n3))
-!                                       = I(n1/2,n2-j2,j3), 
-!                             0<j2<n2/2, 0<=j3<n3, 
-!                         a(0,0,j3) = R(0,0,j3)
-!                                   = R(0,0,n3-j3), 
-!                         a(1,0,j3) = I(0,0,j3)
-!                                   = -I(0,0,n3-j3), 
-!                         a(0,n2/2,j3) = R(0,n2/2,j3)
-!                                      = R(0,n2/2,n3-j3), 
-!                         a(1,n2/2,j3) = I(0,n2/2,j3)
-!                                      = -I(0,n2/2,n3-j3), 
-!                         a(1,0,n3-j3) = R(n1/2,0,j3)
-!                                      = R(n1/2,0,n3-j3), 
-!                         a(0,0,n3-j3) = -I(n1/2,0,j3)
-!                                      = I(n1/2,0,n3-j3), 
-!                         a(1,n2/2,n3-j3) = R(n1/2,n2/2,j3)
-!                                         = R(n1/2,n2/2,n3-j3), 
-!                         a(0,n2/2,n3-j3) = -I(n1/2,n2/2,j3)
-!                                         = I(n1/2,n2/2,n3-j3), 
-!                             0<j3<n3/2, 
-!                         a(0,0,0) = R(0,0,0), 
-!                         a(1,0,0) = R(n1/2,0,0), 
-!                         a(0,0,n3/2) = R(0,0,n3/2), 
-!                         a(1,0,n3/2) = R(n1/2,0,n3/2), 
-!                         a(0,n2/2,0) = R(0,n2/2,0), 
-!                         a(1,n2/2,0) = R(n1/2,n2/2,0), 
-!                         a(0,n2/2,n3/2) = R(0,n2/2,n3/2), 
-!                         a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
-!                 ---- output ordering ----
-!                     call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!                     call rdft3dsort(n1max, n2max, n1, n2, n3, 1, a)
-!                     ! stored data is a(0:n1-1,0:n2-1,0:n3+1):
-!                     ! a(2*k1,k2,k3) = R(k1,k2,k3), 
-!                     ! a(2*k1+1,k2,k3) = I(k1,k2,k3), 
-!                     ! 0<=k1<=n1/2, 0<=k2<n2, 0<=k3<n3.
-!                     ! the stored data is larger than the input data!
-!                 ---- input ordering ----
-!                     call rdft3dsort(n1max, n2max, n1, n2, n3, -1, a)
-!                     call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!         t(0:*) :work area (real*8)
-!                 length of t >= max(8*n2, 8*n3)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2, n3))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1/4, n2/2, n3/2) + n1/4
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!         is 
-!             call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     do j1 = 0, n1 - 1
-!                         a(j1,j2,j3) = a(j1,j2,j3) * (2.0d0/n1/n2/n3)
-!                     end do
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
-!     [definition]
-!         <case1> IDCT (excluding scale)
-!             C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               a(j1,j2,j3) * 
-!                               cos(pi*j1*(k1+1/2)/n1) * 
-!                               cos(pi*j2*(k2+1/2)/n2) * 
-!                               cos(pi*j3*(k3+1/2)/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         <case2> DCT
-!             C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               a(j1,j2,j3) * 
-!                               cos(pi*(j1+1/2)*k1/n1) * 
-!                               cos(pi*(j2+1/2)*k2/n2) * 
-!                               cos(pi*(j3+1/2)*k3/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row1 size of the 3D array (integer)
-!         n2max  :row2 size of the 3D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         n3     :data length (integer)
-!                 n3 >= 2, n3 = power of 2
-!         a(0:n1-1,0:n2-1,0:n3-1)
-!                :input/output data (real*8)
-!                 output data
-!                     a(k1,k2,k3) = C(k1,k2,k3), 
-!                         0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         t(0:*) :work area (real*8)
-!                 length of t >= max(4*n2, 4*n3)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2/2, n3/2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!         is 
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     a(0, j2, j3) = a(0, j2, j3) * 0.5d0
-!                 end do
-!                 do j1 = 0, n1 - 1
-!                     a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
-!                 end do
-!             end do
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
-!                 end do
-!             end do
-!             call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     do j1 = 0, n1 - 1
-!                         a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
-!                     end do
-!                 end do
-!             end do
-!         .
-!
-!
-! -------- DST (Discrete Sine Transform) / Inverse of DST --------
-!     [definition]
-!         <case1> IDST (excluding scale)
-!             S(k1,k2,k3) = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
-!                               A(j1,j2,j3) * 
-!                               sin(pi*j1*(k1+1/2)/n1) * 
-!                               sin(pi*j2*(k2+1/2)/n2) * 
-!                               sin(pi*j3*(k3+1/2)/n3), 
-!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!         <case2> DST
-!             S(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
-!                               a(j1,j2,j3) * 
-!                               sin(pi*(j1+1/2)*k1/n1) * 
-!                               sin(pi*(j2+1/2)*k2/n2) * 
-!                               sin(pi*(j3+1/2)*k3/n3), 
-!                               0<k1<=n1, 0<k2<=n2, 0<k3<=n3
-!     [usage]
-!         <case1>
-!             ip(0) = 0  ! first time only
-!             call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!         <case2>
-!             ip(0) = 0  ! first time only
-!             call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!     [parameters]
-!         n1max  :row1 size of the 3D array (integer)
-!         n2max  :row2 size of the 3D array (integer)
-!         n1     :data length (integer)
-!                 n1 >= 2, n1 = power of 2
-!         n2     :data length (integer)
-!                 n2 >= 2, n2 = power of 2
-!         n3     :data length (integer)
-!                 n3 >= 2, n3 = power of 2
-!         a(0:n1-1,0:n2-1,0:n3-1)
-!                :input/output data (real*8)
-!                 <case1>
-!                     input data
-!                         a(mod(j1,n1),mod(j2,n2),mod(j3,n3)) = A(j1,j2,j3), 
-!                             0<j1<=n1, 0<j2<=n2, 0<j3<=n3
-!                     output data
-!                         a(k1,k2,k3) = S(k1,k2,k3), 
-!                             0<=k1<n1, 0<=k2<n2, 0<=k3<n3
-!                 <case2>
-!                     output data
-!                         a(mod(k1,n1),mod(k2,n2),mod(k3,n3)) = S(k1,k2,k3), 
-!                             0<k1<=n1, 0<k2<=n2, 0<k3<=n3
-!         t(0:*) :work area (real*8)
-!                 length of t >= max(4*n2, 4*n3)
-!         ip(0:*):work area for bit reversal (integer)
-!                 length of ip >= 2+sqrt(n)
-!                 (n = max(n1/2, n2/2, n3/2))
-!                 ip(0),ip(1) are pointers of the cos/sin table.
-!         w(0:*) :cos/sin table (real*8)
-!                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
-!                 w(),ip() are initialized if ip(0) = 0.
-!     [remark]
-!         Inverse of 
-!             call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
-!         is 
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     a(0, j2, j3) = a(0, j2, j3) * 0.5d0
-!                 end do
-!                 do j1 = 0, n1 - 1
-!                     a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
-!                 end do
-!             end do
-!             do j2 = 0, n2 - 1
-!                 do j1 = 0, n1 - 1
-!                     a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
-!                 end do
-!             end do
-!             call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
-!             do j3 = 0, n3 - 1
-!                 do j2 = 0, n2 - 1
-!                     do j1 = 0, n1 - 1
-!                         a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
-!                     end do
-!                 end do
-!             end do
-!         .
-!
-!
-      subroutine cdft3d(n1max, n2max, n1, n2, n3, isgn, a, 
-     &    t, ip, w)
-      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), n
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      n = 2 * max(n2, n3)
-      n = max(n, n1)
-      if (n .gt. 4 * ip(0)) then
-          call makewt(n / 4, ip, w)
-      end if
-      call xdft3da_sub(n1max, n2max, n1, n2, n3, 0, 
-     &    isgn, a, t, ip, w)
-      call cdft3db_sub(n1max, n2max, n1, n2, n3, 
-     &    isgn, a, t, ip, w)
-      end
-!
-      subroutine rdft3d(n1max, n2max, n1, n2, n3, isgn, a, 
-     &    t, ip, w)
-      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
-     &    n, nw, nc
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      n = 2 * max(n2, n3)
-      n = max(n, n1)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n1 .gt. 4 * nc) then
-          nc = n1 / 4
-          call makect(nc, ip, w(nw))
-      end if
-      if (isgn .lt. 0) then
-          call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
-          call cdft3db_sub(n1max, n2max, n1, n2, n3, 
-     &        isgn, a, t, ip, w)
-          call xdft3da_sub(n1max, n2max, n1, n2, n3, 1, 
-     &        isgn, a, t, ip, w)
-      else
-          call xdft3da_sub(n1max, n2max, n1, n2, n3, 1, 
-     &        isgn, a, t, ip, w)
-          call cdft3db_sub(n1max, n2max, n1, n2, n3, 
-     &        isgn, a, t, ip, w)
-          call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
-      end if
-      end
-!
-      subroutine rdft3dsort(n1max, n2max, n1, n2, n3, isgn, a)
-      integer n1max, n2max, n1, n2, n3, isgn, n2h, n3h, j, k
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), x, y
-      n2h = n2 / 2
-      n3h = n3 / 2
-      if (isgn .lt. 0) then
-          do k = 0, n3 - 1
-              do j = n2h + 1, n2 - 1
-                  a(0, j, k) = a(n1 + 1, j, k)
-                  a(1, j, k) = a(n1, j, k)
-              end do
-          end do
-          do k = n3h + 1, n3 - 1
-              a(0, 0, k) = a(n1 + 1, 0, k)
-              a(1, 0, k) = a(n1, 0, k)
-              a(0, n2h, k) = a(n1 + 1, n2h, k)
-              a(1, n2h, k) = a(n1, n2h, k)
-          end do
-          a(1, 0, 0) = a(n1, 0, 0)
-          a(1, n2h, 0) = a(n1, n2h, 0)
-          a(1, 0, n3h) = a(n1, 0, n3h)
-          a(1, n2h, n3h) = a(n1, n2h, n3h)
-      else
-          do j = n2h + 1, n2 - 1
-              y = a(0, j, 0)
-              x = a(1, j, 0)
-              a(n1, j, 0) = x
-              a(n1 + 1, j, 0) = y
-              a(n1, n2 - j, 0) = x
-              a(n1 + 1, n2 - j, 0) = -y
-              a(0, j, 0) = a(0, n2 - j, 0)
-              a(1, j, 0) = -a(1, n2 - j, 0)
-          end do
-          do k = 1, n3 - 1
-              do j = n2h + 1, n2 - 1
-                  y = a(0, j, k)
-                  x = a(1, j, k)
-                  a(n1, j, k) = x
-                  a(n1 + 1, j, k) = y
-                  a(n1, n2 - j, n3 - k) = x
-                  a(n1 + 1, n2 - j, n3 - k) = -y
-                  a(0, j, k) = a(0, n2 - j, n3 - k)
-                  a(1, j, k) = -a(1, n2 - j, n3 - k)
-              end do
-          end do
-          do k = n3h + 1, n3 - 1
-              y = a(0, 0, k)
-              x = a(1, 0, k)
-              a(n1, 0, k) = x
-              a(n1 + 1, 0, k) = y
-              a(n1, 0, n3 - k) = x
-              a(n1 + 1, 0, n3 - k) = -y
-              a(0, 0, k) = a(0, 0, n3 - k)
-              a(1, 0, k) = -a(1, 0, n3 - k)
-              y = a(0, n2h, k)
-              x = a(1, n2h, k)
-              a(n1, n2h, k) = x
-              a(n1 + 1, n2h, k) = y
-              a(n1, n2h, n3 - k) = x
-              a(n1 + 1, n2h, n3 - k) = -y
-              a(0, n2h, k) = a(0, n2h, n3 - k)
-              a(1, n2h, k) = -a(1, n2h, n3 - k)
-          end do
-          a(n1, 0, 0) = a(1, 0, 0)
-          a(n1 + 1, 0, 0) = 0
-          a(1, 0, 0) = 0
-          a(n1, n2h, 0) = a(1, n2h, 0)
-          a(n1 + 1, n2h, 0) = 0
-          a(1, n2h, 0) = 0
-          a(n1, 0, n3h) = a(1, 0, n3h)
-          a(n1 + 1, 0, n3h) = 0
-          a(1, 0, n3h) = 0
-          a(n1, n2h, n3h) = a(1, n2h, n3h)
-          a(n1 + 1, n2h, n3h) = 0
-          a(1, n2h, n3h) = 0
-      end if
-      end
-!
-      subroutine ddct3d(n1max, n2max, n1, n2, n3, isgn, a, 
-     &    t, ip, w)
-      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
-     &    n, nw, nc
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      n = max(n2, n3)
-      n = max(n, n1)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      call ddxt3da_sub(n1max, n2max, n1, n2, n3, 0, 
-     &    isgn, a, t, ip, w)
-      call ddxt3db_sub(n1max, n2max, n1, n2, n3, 0, 
-     &    isgn, a, t, ip, w)
-      end
-!
-      subroutine ddst3d(n1max, n2max, n1, n2, n3, isgn, a, 
-     &    t, ip, w)
-      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
-     &    n, nw, nc
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      n = max(n2, n3)
-      n = max(n, n1)
-      nw = ip(0)
-      if (n .gt. 4 * nw) then
-          nw = n / 4
-          call makewt(nw, ip, w)
-      end if
-      nc = ip(1)
-      if (n .gt. nc) then
-          nc = n
-          call makect(nc, ip, w(nw))
-      end if
-      call ddxt3da_sub(n1max, n2max, n1, n2, n3, 1, 
-     &    isgn, a, t, ip, w)
-      call ddxt3db_sub(n1max, n2max, n1, n2, n3, 1, 
-     &    isgn, a, t, ip, w)
-      end
-!
-! -------- child routines --------
-!
-      subroutine xdft3da_sub(n1max, n2max, n1, n2, n3, icr, 
-     &    isgn, a, t, ip, w)
-      integer n1max, n2max, n1, n2, n3, icr, isgn, 
-     &    ip(0 : *), i, j, k
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      do k = 0, n3 - 1
-          if (icr .eq. 0) then
-              do j = 0, n2 - 1
-                  call cdft(n1, isgn, a(0, j, k), ip, w)
-              end do
-          else if (isgn .ge. 0) then
-              do j = 0, n2 - 1
-                  call rdft(n1, isgn, a(0, j, k), ip, w)
-              end do
-          end if
-          if (n1 .gt. 4) then
-              do i = 0, n1 - 8, 8
-                  do j = 0, n2 - 1
-                      t(2 * j) = a(i, j, k)
-                      t(2 * j + 1) = a(i + 1, j, k)
-                      t(2 * n2 + 2 * j) = a(i + 2, j, k)
-                      t(2 * n2 + 2 * j + 1) = a(i + 3, j, k)
-                      t(4 * n2 + 2 * j) = a(i + 4, j, k)
-                      t(4 * n2 + 2 * j + 1) = a(i + 5, j, k)
-                      t(6 * n2 + 2 * j) = a(i + 6, j, k)
-                      t(6 * n2 + 2 * j + 1) = a(i + 7, j, k)
-                  end do
-                  call cdft(2 * n2, isgn, t, ip, w)
-                  call cdft(2 * n2, isgn, t(2 * n2), ip, w)
-                  call cdft(2 * n2, isgn, t(4 * n2), ip, w)
-                  call cdft(2 * n2, isgn, t(6 * n2), ip, w)
-                  do j = 0, n2 - 1
-                      a(i, j, k) = t(2 * j)
-                      a(i + 1, j, k) = t(2 * j + 1)
-                      a(i + 2, j, k) = t(2 * n2 + 2 * j)
-                      a(i + 3, j, k) = t(2 * n2 + 2 * j + 1)
-                      a(i + 4, j, k) = t(4 * n2 + 2 * j)
-                      a(i + 5, j, k) = t(4 * n2 + 2 * j + 1)
-                      a(i + 6, j, k) = t(6 * n2 + 2 * j)
-                      a(i + 7, j, k) = t(6 * n2 + 2 * j + 1)
-                  end do
-              end do
-          else if (n1 .eq. 4) then
-              do j = 0, n2 - 1
-                  t(2 * j) = a(0, j, k)
-                  t(2 * j + 1) = a(1, j, k)
-                  t(2 * n2 + 2 * j) = a(2, j, k)
-                  t(2 * n2 + 2 * j + 1) = a(3, j, k)
-              end do
-              call cdft(2 * n2, isgn, t, ip, w)
-              call cdft(2 * n2, isgn, t(2 * n2), ip, w)
-              do j = 0, n2 - 1
-                  a(0, j, k) = t(2 * j)
-                  a(1, j, k) = t(2 * j + 1)
-                  a(2, j, k) = t(2 * n2 + 2 * j)
-                  a(3, j, k) = t(2 * n2 + 2 * j + 1)
-              end do
-          else if (n1 .eq. 2) then
-              do j = 0, n2 - 1
-                  t(2 * j) = a(0, j, k)
-                  t(2 * j + 1) = a(1, j, k)
-              end do
-              call cdft(2 * n2, isgn, t, ip, w)
-              do j = 0, n2 - 1
-                  a(0, j, k) = t(2 * j)
-                  a(1, j, k) = t(2 * j + 1)
-              end do
-          end if
-          if (icr .ne. 0 .and. isgn .lt. 0) then
-              do j = 0, n2 - 1
-                  call rdft(n1, isgn, a(0, j, k), ip, w)
-              end do
-          end if
-      end do
-      end
-!
-      subroutine cdft3db_sub(n1max, n2max, n1, n2, n3, 
-     &    isgn, a, t, ip, w)
-      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
-     &    i, j, k
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      if (n1 .gt. 4) then
-          do j = 0, n2 - 1
-              do i = 0, n1 - 8, 8
-                  do k = 0, n3 - 1
-                      t(2 * k) = a(i, j, k)
-                      t(2 * k + 1) = a(i + 1, j, k)
-                      t(2 * n3 + 2 * k) = a(i + 2, j, k)
-                      t(2 * n3 + 2 * k + 1) = a(i + 3, j, k)
-                      t(4 * n3 + 2 * k) = a(i + 4, j, k)
-                      t(4 * n3 + 2 * k + 1) = a(i + 5, j, k)
-                      t(6 * n3 + 2 * k) = a(i + 6, j, k)
-                      t(6 * n3 + 2 * k + 1) = a(i + 7, j, k)
-                  end do
-                  call cdft(2 * n3, isgn, t, ip, w)
-                  call cdft(2 * n3, isgn, t(2 * n3), ip, w)
-                  call cdft(2 * n3, isgn, t(4 * n3), ip, w)
-                  call cdft(2 * n3, isgn, t(6 * n3), ip, w)
-                  do k = 0, n3 - 1
-                      a(i, j, k) = t(2 * k)
-                      a(i + 1, j, k) = t(2 * k + 1)
-                      a(i + 2, j, k) = t(2 * n3 + 2 * k)
-                      a(i + 3, j, k) = t(2 * n3 + 2 * k + 1)
-                      a(i + 4, j, k) = t(4 * n3 + 2 * k)
-                      a(i + 5, j, k) = t(4 * n3 + 2 * k + 1)
-                      a(i + 6, j, k) = t(6 * n3 + 2 * k)
-                      a(i + 7, j, k) = t(6 * n3 + 2 * k + 1)
-                  end do
-              end do
-          end do
-      else if (n1 .eq. 4) then
-          do j = 0, n2 - 1
-              do k = 0, n3 - 1
-                  t(2 * k) = a(0, j, k)
-                  t(2 * k + 1) = a(1, j, k)
-                  t(2 * n3 + 2 * k) = a(2, j, k)
-                  t(2 * n3 + 2 * k + 1) = a(3, j, k)
-              end do
-              call cdft(2 * n3, isgn, t, ip, w)
-              call cdft(2 * n3, isgn, t(2 * n3), ip, w)
-              do k = 0, n3 - 1
-                  a(0, j, k) = t(2 * k)
-                  a(1, j, k) = t(2 * k + 1)
-                  a(2, j, k) = t(2 * n3 + 2 * k)
-                  a(3, j, k) = t(2 * n3 + 2 * k + 1)
-              end do
-          end do
-      else if (n1 .eq. 2) then
-          do j = 0, n2 - 1
-              do k = 0, n3 - 1
-                  t(2 * k) = a(0, j, k)
-                  t(2 * k + 1) = a(1, j, k)
-              end do
-              call cdft(2 * n3, isgn, t, ip, w)
-              do k = 0, n3 - 1
-                  a(0, j, k) = t(2 * k)
-                  a(1, j, k) = t(2 * k + 1)
-              end do
-          end do
-      end if
-      end
-!
-      subroutine rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
-      integer n1max, n2max, n1, n2, n3, isgn, 
-     &    n2h, n3h, i, j, k, l
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), xi
-      n2h = n2 / 2
-      n3h = n3 / 2
-      if (isgn .lt. 0) then
-          do k = 1, n3h - 1
-              l = n3 - k
-              xi = a(0, 0, k) - a(0, 0, l)
-              a(0, 0, k) = a(0, 0, k) + a(0, 0, l)
-              a(0, 0, l) = xi
-              xi = a(1, 0, l) - a(1, 0, k)
-              a(1, 0, k) = a(1, 0, k) + a(1, 0, l)
-              a(1, 0, l) = xi
-              xi = a(0, n2h, k) - a(0, n2h, l)
-              a(0, n2h, k) = a(0, n2h, k) + a(0, n2h, l)
-              a(0, n2h, l) = xi
-              xi = a(1, n2h, l) - a(1, n2h, k)
-              a(1, n2h, k) = a(1, n2h, k) + a(1, n2h, l)
-              a(1, n2h, l) = xi
-              do i = 1, n2h - 1
-                  j = n2 - i
-                  xi = a(0, i, k) - a(0, j, l)
-                  a(0, i, k) = a(0, i, k) + a(0, j, l)
-                  a(0, j, l) = xi
-                  xi = a(1, j, l) - a(1, i, k)
-                  a(1, i, k) = a(1, i, k) + a(1, j, l)
-                  a(1, j, l) = xi
-                  xi = a(0, i, l) - a(0, j, k)
-                  a(0, i, l) = a(0, i, l) + a(0, j, k)
-                  a(0, j, k) = xi
-                  xi = a(1, j, k) - a(1, i, l)
-                  a(1, i, l) = a(1, i, l) + a(1, j, k)
-                  a(1, j, k) = xi
-              end do
-          end do
-          do i = 1, n2h - 1
-              j = n2 - i
-              xi = a(0, i, 0) - a(0, j, 0)
-              a(0, i, 0) = a(0, i, 0) + a(0, j, 0)
-              a(0, j, 0) = xi
-              xi = a(1, j, 0) - a(1, i, 0)
-              a(1, i, 0) = a(1, i, 0) + a(1, j, 0)
-              a(1, j, 0) = xi
-              xi = a(0, i, n3h) - a(0, j, n3h)
-              a(0, i, n3h) = a(0, i, n3h) + a(0, j, n3h)
-              a(0, j, n3h) = xi
-              xi = a(1, j, n3h) - a(1, i, n3h)
-              a(1, i, n3h) = a(1, i, n3h) + a(1, j, n3h)
-              a(1, j, n3h) = xi
-          end do
-      else
-          do k = 1, n3h - 1
-              l = n3 - k
-              a(0, 0, l) = 0.5d0 * (a(0, 0, k) - a(0, 0, l))
-              a(0, 0, k) = a(0, 0, k) - a(0, 0, l)
-              a(1, 0, l) = 0.5d0 * (a(1, 0, k) + a(1, 0, l))
-              a(1, 0, k) = a(1, 0, k) - a(1, 0, l)
-              a(0, n2h, l) = 0.5d0 * (a(0, n2h, k) - a(0, n2h, l))
-              a(0, n2h, k) = a(0, n2h, k) - a(0, n2h, l)
-              a(1, n2h, l) = 0.5d0 * (a(1, n2h, k) + a(1, n2h, l))
-              a(1, n2h, k) = a(1, n2h, k) - a(1, n2h, l)
-              do i = 1, n2h - 1
-                  j = n2 - i
-                  a(0, j, l) = 0.5d0 * (a(0, i, k) - a(0, j, l))
-                  a(0, i, k) = a(0, i, k) - a(0, j, l)
-                  a(1, j, l) = 0.5d0 * (a(1, i, k) + a(1, j, l))
-                  a(1, i, k) = a(1, i, k) - a(1, j, l)
-                  a(0, j, k) = 0.5d0 * (a(0, i, l) - a(0, j, k))
-                  a(0, i, l) = a(0, i, l) - a(0, j, k)
-                  a(1, j, k) = 0.5d0 * (a(1, i, l) + a(1, j, k))
-                  a(1, i, l) = a(1, i, l) - a(1, j, k)
-              end do
-          end do
-          do i = 1, n2h - 1
-              j = n2 - i
-              a(0, j, 0) = 0.5d0 * (a(0, i, 0) - a(0, j, 0))
-              a(0, i, 0) = a(0, i, 0) - a(0, j, 0)
-              a(1, j, 0) = 0.5d0 * (a(1, i, 0) + a(1, j, 0))
-              a(1, i, 0) = a(1, i, 0) - a(1, j, 0)
-              a(0, j, n3h) = 0.5d0 * (a(0, i, n3h) - a(0, j, n3h))
-              a(0, i, n3h) = a(0, i, n3h) - a(0, j, n3h)
-              a(1, j, n3h) = 0.5d0 * (a(1, i, n3h) + a(1, j, n3h))
-              a(1, i, n3h) = a(1, i, n3h) - a(1, j, n3h)
-          end do
-      end if
-      end
-!
-      subroutine ddxt3da_sub(n1max, n2max, n1, n2, n3, ics, 
-     &    isgn, a, t, ip, w)
-      integer n1max, n2max, n1, n2, n3, ics, isgn, 
-     &    ip(0 : *), i, j, k
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      do k = 0, n3 - 1
-          if (ics .eq. 0) then
-              do j = 0, n2 - 1
-                  call ddct(n1, isgn, a(0, j, k), ip, w)
-              end do
-          else
-              do j = 0, n2 - 1
-                  call ddst(n1, isgn, a(0, j, k), ip, w)
-              end do
-          end if
-          if (n1 .gt. 2) then
-              do i = 0, n1 - 4, 4
-                  do j = 0, n2 - 1
-                      t(j) = a(i, j, k)
-                      t(n2 + j) = a(i + 1, j, k)
-                      t(2 * n2 + j) = a(i + 2, j, k)
-                      t(3 * n2 + j) = a(i + 3, j, k)
-                  end do
-                  if (ics .eq. 0) then
-                      call ddct(n2, isgn, t, ip, w)
-                      call ddct(n2, isgn, t(n2), ip, w)
-                      call ddct(n2, isgn, t(2 * n2), ip, w)
-                      call ddct(n2, isgn, t(3 * n2), ip, w)
-                  else
-                      call ddst(n2, isgn, t, ip, w)
-                      call ddst(n2, isgn, t(n2), ip, w)
-                      call ddst(n2, isgn, t(2 * n2), ip, w)
-                      call ddst(n2, isgn, t(3 * n2), ip, w)
-                  end if
-                  do j = 0, n2 - 1
-                      a(i, j, k) = t(j)
-                      a(i + 1, j, k) = t(n2 + j)
-                      a(i + 2, j, k) = t(2 * n2 + j)
-                      a(i + 3, j, k) = t(3 * n2 + j)
-                  end do
-              end do
-          else if (n1 .eq. 2) then
-              do j = 0, n2 - 1
-                  t(j) = a(0, j, k)
-                  t(n2 + j) = a(1, j, k)
-              end do
-              if (ics .eq. 0) then
-                  call ddct(n2, isgn, t, ip, w)
-                  call ddct(n2, isgn, t(n2), ip, w)
-              else
-                  call ddst(n2, isgn, t, ip, w)
-                  call ddst(n2, isgn, t(n2), ip, w)
-              end if
-              do j = 0, n2 - 1
-                  a(0, j, k) = t(j)
-                  a(1, j, k) = t(n2 + j)
-              end do
-          end if
-      end do
-      end
-!
-      subroutine ddxt3db_sub(n1max, n2max, n1, n2, n3, ics, 
-     &    isgn, a, t, ip, w)
-      integer n1max, n2max, n1, n2, n3, ics, isgn, 
-     &    ip(0 : *), i, j, k
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
-     &    t(0 : *), w(0 : *)
-      if (n1 .gt. 2) then
-          do j = 0, n2 - 1
-              do i = 0, n1 - 4, 4
-                  do k = 0, n3 - 1
-                      t(k) = a(i, j, k)
-                      t(n3 + k) = a(i + 1, j, k)
-                      t(2 * n3 + k) = a(i + 2, j, k)
-                      t(3 * n3 + k) = a(i + 3, j, k)
-                  end do
-                  if (ics .eq. 0) then
-                      call ddct(n3, isgn, t, ip, w)
-                      call ddct(n3, isgn, t(n3), ip, w)
-                      call ddct(n3, isgn, t(2 * n3), ip, w)
-                      call ddct(n3, isgn, t(3 * n3), ip, w)
-                  else
-                      call ddst(n3, isgn, t, ip, w)
-                      call ddst(n3, isgn, t(n3), ip, w)
-                      call ddst(n3, isgn, t(2 * n3), ip, w)
-                      call ddst(n3, isgn, t(3 * n3), ip, w)
-                  end if
-                  do k = 0, n3 - 1
-                      a(i, j, k) = t(k)
-                      a(i + 1, j, k) = t(n3 + k)
-                      a(i + 2, j, k) = t(2 * n3 + k)
-                      a(i + 3, j, k) = t(3 * n3 + k)
-                  end do
-              end do
-          end do
-      else if (n1 .eq. 2) then
-          do j = 0, n2 - 1
-              do k = 0, n3 - 1
-                  t(k) = a(0, j, k)
-                  t(n3 + k) = a(1, j, k)
-              end do
-              if (ics .eq. 0) then
-                  call ddct(n3, isgn, t, ip, w)
-                  call ddct(n3, isgn, t(n3), ip, w)
-              else
-                  call ddst(n3, isgn, t, ip, w)
-                  call ddst(n3, isgn, t(n3), ip, w)
-              end if
-              do k = 0, n3 - 1
-                  a(0, j, k) = t(k)
-                  a(1, j, k) = t(n3 + k)
-              end do
-          end do
-      end if
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/readme2d.txt b/third_party/tensorflow_dependencies/fft2d/readme2d.txt
deleted file mode 100644
index 7cc4dfa..0000000
--- a/third_party/tensorflow_dependencies/fft2d/readme2d.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-General Purpose 2D,3D FFT (Fast Fourier Transform) Package
-
-Files
-    alloc.c    : 2D-array Allocation
-    alloc.h    : 2D-array Allocation
-    fft4f2d.c  : 2D FFT Package in C       - Version I (radix 4, 2)
-    fft4f2d.f  : 2D FFT Package in Fortran - Version I (radix 4, 2)
-    fftsg.c    : 1D FFT Package in C       - Fast Version (Split-Radix)
-    fftsg.f    : 1D FFT Package in Fortran - Fast Version (Split-Radix)
-    fftsg2d.c  : 2D FFT Package in C       - Version II (Split-Radix)
-    fftsg2d.f  : 2D FFT Package in Fortran - Version II (Split-Radix)
-    fftsg3d.c  : 3D FFT Package in C       - Version II (Split-Radix)
-    fftsg3d.f  : 3D FFT Package in Fortran - Version II (Split-Radix)
-    shrtdct.c  : 8x8, 16x16 DCT Package
-    sample2d/
-        Makefile    : for gcc, cc
-        Makefile.f77: for Fortran
-        Makefile.pth: Pthread version
-        fft4f2dt.c  : Test Program for "fft4f2d.c"
-        fft4f2dt.f  : Test Program for "fft4f2d.f"
-        fftsg2dt.c  : Test Program for "fftsg2d.c"
-        fftsg2dt.f  : Test Program for "fftsg2d.f"
-        fftsg3dt.c  : Test Program for "fftsg3d.c"
-        fftsg3dt.f  : Test Program for "fftsg3d.f"
-        shrtdctt.c  : Test Program for "shrtdct.c"
-
-Difference of Files
-    C and Fortran versions are equal and 
-    the same routines are in each version.
-    ---- Difference between "fft4f2d.*" and "fftsg2d.*" ----
-    "fft4f2d.*" are optimized for the old machines that 
-    don't have the large size CPU cache.
-    "fftsg2d.*", "fftsg3d.*" use 1D FFT routines in "fftsg.*".
-    "fftsg2d.*", "fftsg3d.*" are optimized for the machines that 
-    have the multi-level (L1,L2,etc) cache.
-
-Routines in the Package
-    in fft4f2d.*, fftsg2d.*
-        cdft2d: 2-dim Complex Discrete Fourier Transform
-        rdft2d: 2-dim Real Discrete Fourier Transform
-        ddct2d: 2-dim Discrete Cosine Transform
-        ddst2d: 2-dim Discrete Sine Transform
-        rdft2dsort: rdft2d input/output ordering (fftsg2d.*)
-    in fftsg3d.*
-        cdft3d: 3-dim Complex Discrete Fourier Transform
-        rdft3d: 3-dim Real Discrete Fourier Transform
-        ddct3d: 3-dim Discrete Cosine Transform
-        ddst3d: 3-dim Discrete Sine Transform
-        rdft3dsort: rdft3d input/output ordering
-    in fftsg.*
-        cdft: 1-dim Complex Discrete Fourier Transform
-        rdft: 1-dim Real Discrete Fourier Transform
-        ddct: 1-dim Discrete Cosine Transform
-        ddst: 1-dim Discrete Sine Transform
-        dfct: 1-dim Real Symmetric DFT
-        dfst: 1-dim Real Anti-symmetric DFT
-        (these routines are called by fftsg2d.*, fftsg3d.*)
-    in shrtdct.c
-        ddct8x8s  : Normalized 8x8 DCT
-        ddct16x16s: Normalized 16x16 DCT
-        (faster than ddct2d())
-
-Usage
-    Brief explanations are in block comments of each packages.
-    The examples are given in the test programs.
-
-Copyright
-    Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
-    You may use, copy, modify this code for any purpose and 
-    without fee. You may distribute this ORIGINAL package.
-
-History
-    ...
-    Nov. 2001  : Add 3D-FFT routines
-    Dec. 2006  : Fix a documentation bug in "fftsg3d.*"
-    Dec. 2006  : Fix a minor bug in "fftsg.f"
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile b/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile
deleted file mode 100644
index 130e0a4..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile
+++ /dev/null
@@ -1,72 +0,0 @@
-# ---- for GNU gcc ----
-
-CC = gcc
-
-CFLAGS = -Wall
-
-OFLAGS = -O2
-
-# ---- for SUN WS cc ----
-#
-#CC = cc
-#
-#CFLAGS = 
-#
-#OFLAGS = -xO2
-
-
-
-
-all: fft4f2dt fftsg2dt fftsg3dt shrtdctt
-
-
-fft4f2dt : fft4f2dt.o fft4f2d.o alloc.o
-	$(CC) fft4f2dt.o fft4f2d.o alloc.o -lm -o fft4f2dt
-
-fftsg2dt : fftsg2dt.o fftsg2d.o fftsg.o alloc.o
-	$(CC) fftsg2dt.o fftsg2d.o fftsg.o alloc.o -lm -o fftsg2dt
-
-fftsg3dt : fftsg3dt.o fftsg3d.o fftsg.o alloc.o
-	$(CC) fftsg3dt.o fftsg3d.o fftsg.o alloc.o -lm -o fftsg3dt
-
-shrtdctt : shrtdctt.o shrtdct.o
-	$(CC) shrtdctt.o shrtdct.o -lm -o shrtdctt
-
-
-fft4f2dt.o : fft4f2dt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c fft4f2dt.c -o fft4f2dt.o
-
-fftsg2dt.o : fftsg2dt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg2dt.c -o fftsg2dt.o
-
-fftsg3dt.o : fftsg3dt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg3dt.c -o fftsg3dt.o
-
-shrtdctt.o : shrtdctt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c shrtdctt.c -o shrtdctt.o
-
-
-fft4f2d.o : ../fft4f2d.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fft4f2d.c -o fft4f2d.o
-
-fftsg2d.o : ../fftsg2d.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg2d.c -o fftsg2d.o
-
-fftsg3d.o : ../fftsg3d.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg3d.c -o fftsg3d.o
-
-fftsg.o : ../fftsg.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg.c -o fftsg.o
-
-alloc.o : ../alloc.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../alloc.c -o alloc.o
-
-shrtdct.o : ../shrtdct.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../shrtdct.c -o shrtdct.o
-
-
-
-
-clean:
-	rm -f *.o
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.f77 b/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.f77
deleted file mode 100644
index 0162fdc..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.f77
+++ /dev/null
@@ -1,60 +0,0 @@
-# ---- for GNU g77 ----
-
-F77 = g77
-
-FFLAGS = -Wall
-
-OFLAGS = -O2
-
-# ---- for SUN WS f77 ----
-#
-#F77 = f77
-#
-#FFLAGS = 
-#
-#OFLAGS = -xO2
-
-
-
-
-all: fft4f2dt_f fftsg2dt_f fftsg3dt_f
-
-
-fft4f2dt_f : fft4f2dt_f.o fft4f2d_f.o
-	$(F77) fft4f2dt_f.o fft4f2d_f.o -o fft4f2dt_f
-
-fftsg2dt_f : fftsg2dt_f.o fftsg2d_f.o fftsg_f.o
-	$(F77) fftsg2dt_f.o fftsg2d_f.o fftsg_f.o -o fftsg2dt_f
-
-fftsg3dt_f : fftsg3dt_f.o fftsg3d_f.o fftsg_f.o
-	$(F77) fftsg3dt_f.o fftsg3d_f.o fftsg_f.o -o fftsg3dt_f
-
-
-fft4f2dt_f.o : fft4f2dt.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c fft4f2dt.f -o fft4f2dt_f.o
-
-fftsg2dt_f.o : fftsg2dt.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c fftsg2dt.f -o fftsg2dt_f.o
-
-fftsg3dt_f.o : fftsg3dt.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c fftsg3dt.f -o fftsg3dt_f.o
-
-
-fft4f2d_f.o : ../fft4f2d.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c ../fft4f2d.f -o fft4f2d_f.o
-
-fftsg2d_f.o : ../fftsg2d.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg2d.f -o fftsg2d_f.o
-
-fftsg3d_f.o : ../fftsg3d.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg3d.f -o fftsg3d_f.o
-
-fftsg_f.o : ../fftsg.f
-	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg.f -o fftsg_f.o
-
-
-
-
-clean:
-	rm -f *.o
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.pth b/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.pth
deleted file mode 100644
index d46941e..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/Makefile.pth
+++ /dev/null
@@ -1,54 +0,0 @@
-# ---- for GNU gcc ----
-
-CC = gcc
-
-CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
-
-OFLAGS = -O2
-
-# ---- for SUN WS cc ----
-#
-#CC = cc
-#
-#CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
-#
-#OFLAGS = -xO2
-
-
-
-
-all: fftsg2dt_pt fftsg3dt_pt
-
-
-fftsg2dt_pt : fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o
-	$(CC) fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg2dt_pt
-
-fftsg3dt_pt : fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o
-	$(CC) fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg3dt_pt
-
-
-fftsg2dt.o : fftsg2dt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg2dt.c -o fftsg2dt.o
-
-fftsg3dt.o : fftsg3dt.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg3dt.c -o fftsg3dt.o
-
-
-fftsg2dpt.o : ../fftsg2d.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg2d.c -o fftsg2dpt.o
-
-fftsg3dpt.o : ../fftsg3d.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg3d.c -o fftsg3dpt.o
-
-fftsg.o : ../fftsg.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg.c -o fftsg.o
-
-alloc.o : ../alloc.c
-	$(CC) $(CFLAGS) $(OFLAGS) -c ../alloc.c -o alloc.o
-
-
-
-
-clean:
-	rm -f *.o
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/alloc.h b/third_party/tensorflow_dependencies/fft2d/sample2d/alloc.h
deleted file mode 100644
index 3467cc4..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/alloc.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* ---- memory allocation ---- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-
-
-int *alloc_1d_int(int n1);
-void free_1d_int(int *i);
-double *alloc_1d_double(int n1);
-void free_1d_double(double *d);
-int **alloc_2d_int(int n1, int n2);
-void free_2d_int(int **ii);
-double **alloc_2d_double(int n1, int n2);
-void free_2d_double(double **dd);
-int ***alloc_3d_int(int n1, int n2, int n3);
-void free_3d_int(int ***iii);
-double ***alloc_3d_double(int n1, int n2, int n3);
-void free_3d_double(double ***ddd);
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.c b/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.c
deleted file mode 100644
index 2f300fd..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/* test of fft4f2d.c */
-
-#include <math.h>
-#include <stdio.h>
-#include "alloc.h"
-#define MAX(x,y) ((x) > (y) ? (x) : (y))
-
-/* random number generator, 0 <= RND < 1 */
-#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
-
-
-int main()
-{
-    void cdft2d(int, int, int, double **, int *, double *);
-    void rdft2d(int, int, int, double **, int *, double *);
-    void ddct2d(int, int, int, double **, double **, int *, double *);
-    void ddst2d(int, int, int, double **, double **, int *, double *);
-    void putdata2d(int n1, int n2, double **a);
-    double errorcheck2d(int n1, int n2, double scale, double **a);
-    int *ip, n1, n2, n, i;
-    double **a, **t, *w, err;
-
-    printf("data length n1=? (n1 = power of 2) \n");
-    scanf("%d", &n1);
-    printf("data length n2=? (n2 = power of 2) \n");
-    scanf("%d", &n2);
-
-    a = alloc_2d_double(n1, n2);
-    t = alloc_2d_double(n1, n2);
-    n = MAX(n1, n2 / 2);
-    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
-    n = MAX(n1 / 2, n2 / 4) + MAX(n1, n2);
-    w = alloc_1d_double(n);
-    ip[0] = 0;
-
-    /* check of CDFT */
-    putdata2d(n1, n2, a);
-    cdft2d(n1, n2, 1, a, ip, w);
-    cdft2d(n1, n2, -1, a, ip, w);
-    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
-    printf("cdft2d err= %g \n", err);
-
-    /* check of RDFT */
-    putdata2d(n1, n2, a);
-    rdft2d(n1, n2, 1, a, ip, w);
-    rdft2d(n1, n2, -1, a, ip, w);
-    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
-    printf("rdft2d err= %g \n", err);
-
-    /* check of DDCT */
-    putdata2d(n1, n2, a);
-    ddct2d(n1, n2, 1, a, t, ip, w);
-    ddct2d(n1, n2, -1, a, t, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        a[i][0] *= 0.5;
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        a[0][i] *= 0.5;
-    }
-    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
-    printf("ddct2d err= %g \n", err);
-
-    /* check of DDST */
-    putdata2d(n1, n2, a);
-    ddst2d(n1, n2, 1, a, t, ip, w);
-    ddst2d(n1, n2, -1, a, t, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        a[i][0] *= 0.5;
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        a[0][i] *= 0.5;
-    }
-    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
-    printf("ddst2d err= %g \n", err);
-
-    free_1d_double(w);
-    free_1d_int(ip);
-    free_2d_double(t);
-    free_2d_double(a);
-    return 0;
-}
-
-
-void putdata2d(int n1, int n2, double **a)
-{
-    int j1, j2, seed = 0;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            a[j1][j2] = RND(&seed);
-        }
-    }
-}
-
-
-double errorcheck2d(int n1, int n2, double scale, double **a)
-{
-    int j1, j2, seed = 0;
-    double err = 0, e;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            e = RND(&seed) - a[j1][j2] * scale;
-            err = MAX(err, fabs(e));
-        }
-    }
-    return err;
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.f b/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.f
deleted file mode 100644
index 43ac8ac..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fft4f2dt.f
+++ /dev/null
@@ -1,95 +0,0 @@
-! test of fft4f2d.f
-!
-      program main
-      integer nmax, nmaxsqrt
-      parameter (nmax = 1024)
-      parameter (nmaxsqrt = 32)
-      integer ip(0 : nmaxsqrt + 1), n1, n2, i
-      real*8 a(0 : nmax - 1, 0 : nmax - 1), 
-     &    t(0 : nmax - 1, 0 : nmax - 1), w(0 : nmax * 3 / 2 - 1), 
-     &    err, errorcheck2d
-!
-      write (*, *) 'data length n1=? (n1 = power of 2) '
-      read (*, *) n1
-      write (*, *) 'data length n2=? (n2 = power of 2) '
-      read (*, *) n2
-      ip(0) = 0
-!
-!   check of CDFT
-      call putdata2d(nmax, n1, n2, a)
-      call cdft2d(nmax, n1, n2, 1, a, ip, w)
-      call cdft2d(nmax, n1, n2, -1, a, ip, w)
-      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
-      write (*, *) 'cdft2d err= ', err
-!
-!   check of RDFT
-      call putdata2d(nmax, n1, n2, a)
-      call rdft2d(nmax, n1, n2, 1, a, ip, w)
-      call rdft2d(nmax, n1, n2, -1, a, ip, w)
-      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
-      write (*, *) 'rdft2d err= ', err
-!
-!   check of DDCT
-      call putdata2d(nmax, n1, n2, a)
-      call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
-      call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
-      do i = 0, n1 - 1
-          a(i, 0) = a(i, 0) * 0.5d0
-      end do
-      do i = 0, n2 - 1
-          a(0, i) = a(0, i) * 0.5d0
-      end do
-      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
-      write (*, *) 'ddct2d err= ', err
-!
-!   check of DDST
-      call putdata2d(nmax, n1, n2, a)
-      call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
-      call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
-      do i = 0, n1 - 1
-          a(i, 0) = a(i, 0) * 0.5d0
-      end do
-      do i = 0, n2 - 1
-          a(0, i) = a(0, i) * 0.5d0
-      end do
-      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
-      write (*, *) 'ddst2d err= ', err
-!
-      end
-!
-!
-      subroutine putdata2d(n1max, n1, n2, a)
-      integer n1max, n1, n2, j1, j2, seed
-      real*8 a(0 : n1max - 1, 0 : *), drnd
-      seed = 0
-      do j2 = 0, n2 - 1
-          do j1 = 0, n1 - 1
-              a(j1, j2) = drnd(seed)
-          end do
-      end do
-      end
-!
-!
-      function errorcheck2d(n1max, n1, n2, scale, a)
-      integer n1max, n1, n2, j1, j2, seed
-      real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e, 
-     &    errorcheck2d
-      err = 0
-      seed = 0
-      do j2 = 0, n2 - 1
-          do j1 = 0, n1 - 1
-              e = drnd(seed) - a(j1, j2) * scale
-              err = max(err, abs(e))
-          end do
-      end do
-      errorcheck2d = err
-      end
-!
-!
-! random number generator, 0 <= drnd < 1
-      real*8 function drnd(seed)
-      integer seed
-      seed = mod(seed * 7141 + 54773, 259200)
-      drnd = seed * (1.0d0 / 259200)
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.c b/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.c
deleted file mode 100644
index c4cf935..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/* test of fftsg2d.c */
-
-#include <math.h>
-#include <stdio.h>
-#include "alloc.h"
-#define MAX(x,y) ((x) > (y) ? (x) : (y))
-
-/* random number generator, 0 <= RND < 1 */
-#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
-
-
-int main()
-{
-    void cdft2d(int, int, int, double **, double *, int *, double *);
-    void rdft2d(int, int, int, double **, double *, int *, double *);
-    void ddct2d(int, int, int, double **, double *, int *, double *);
-    void ddst2d(int, int, int, double **, double *, int *, double *);
-    void putdata2d(int n1, int n2, double **a);
-    double errorcheck2d(int n1, int n2, double scale, double **a);
-    int *ip, n1, n2, n, i;
-    double **a, *w, err;
-
-    printf("data length n1=? (n1 = power of 2) \n");
-    scanf("%d", &n1);
-    printf("data length n2=? (n2 = power of 2) \n");
-    scanf("%d", &n2);
-
-    a = alloc_2d_double(n1, n2);
-    n = MAX(n1, n2 / 2);
-    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
-    n = MAX(n1, n2) * 3 / 2;
-    w = alloc_1d_double(n);
-    ip[0] = 0;
-
-    /* check of CDFT */
-    putdata2d(n1, n2, a);
-    cdft2d(n1, n2, 1, a, NULL, ip, w);
-    cdft2d(n1, n2, -1, a, NULL, ip, w);
-    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
-    printf("cdft2d err= %g \n", err);
-
-    /* check of RDFT */
-    putdata2d(n1, n2, a);
-    rdft2d(n1, n2, 1, a, NULL, ip, w);
-    rdft2d(n1, n2, -1, a, NULL, ip, w);
-    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
-    printf("rdft2d err= %g \n", err);
-
-    /* check of DDCT */
-    putdata2d(n1, n2, a);
-    ddct2d(n1, n2, 1, a, NULL, ip, w);
-    ddct2d(n1, n2, -1, a, NULL, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        a[i][0] *= 0.5;
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        a[0][i] *= 0.5;
-    }
-    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
-    printf("ddct2d err= %g \n", err);
-
-    /* check of DDST */
-    putdata2d(n1, n2, a);
-    ddst2d(n1, n2, 1, a, NULL, ip, w);
-    ddst2d(n1, n2, -1, a, NULL, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        a[i][0] *= 0.5;
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        a[0][i] *= 0.5;
-    }
-    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
-    printf("ddst2d err= %g \n", err);
-
-    free_1d_double(w);
-    free_1d_int(ip);
-    free_2d_double(a);
-    return 0;
-}
-
-
-void putdata2d(int n1, int n2, double **a)
-{
-    int j1, j2, seed = 0;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            a[j1][j2] = RND(&seed);
-        }
-    }
-}
-
-
-double errorcheck2d(int n1, int n2, double scale, double **a)
-{
-    int j1, j2, seed = 0;
-    double err = 0, e;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            e = RND(&seed) - a[j1][j2] * scale;
-            err = MAX(err, fabs(e));
-        }
-    }
-    return err;
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.f b/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.f
deleted file mode 100644
index a375c05..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg2dt.f
+++ /dev/null
@@ -1,94 +0,0 @@
-! test of fftsg2d.f
-!
-      program main
-      integer nmax, nmaxsqrt
-      parameter (nmax = 1024)
-      parameter (nmaxsqrt = 32)
-      integer ip(0 : nmaxsqrt + 1), n1, n2, i
-      real*8 a(0 : nmax - 1, 0 : nmax - 1), t(0 : 8 * nmax - 1), 
-     &    w(0 : nmax * 3 / 2 - 1), err, errorcheck2d
-!
-      write (*, *) 'data length n1=? (n1 = power of 2) '
-      read (*, *) n1
-      write (*, *) 'data length n2=? (n2 = power of 2) '
-      read (*, *) n2
-      ip(0) = 0
-!
-!   check of CDFT
-      call putdata2d(nmax, n1, n2, a)
-      call cdft2d(nmax, n1, n2, 1, a, t, ip, w)
-      call cdft2d(nmax, n1, n2, -1, a, t, ip, w)
-      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
-      write (*, *) 'cdft2d err= ', err
-!
-!   check of RDFT
-      call putdata2d(nmax, n1, n2, a)
-      call rdft2d(nmax, n1, n2, 1, a, t, ip, w)
-      call rdft2d(nmax, n1, n2, -1, a, t, ip, w)
-      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
-      write (*, *) 'rdft2d err= ', err
-!
-!   check of DDCT
-      call putdata2d(nmax, n1, n2, a)
-      call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
-      call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
-      do i = 0, n1 - 1
-          a(i, 0) = a(i, 0) * 0.5d0
-      end do
-      do i = 0, n2 - 1
-          a(0, i) = a(0, i) * 0.5d0
-      end do
-      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
-      write (*, *) 'ddct2d err= ', err
-!
-!   check of DDST
-      call putdata2d(nmax, n1, n2, a)
-      call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
-      call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
-      do i = 0, n1 - 1
-          a(i, 0) = a(i, 0) * 0.5d0
-      end do
-      do i = 0, n2 - 1
-          a(0, i) = a(0, i) * 0.5d0
-      end do
-      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
-      write (*, *) 'ddst2d err= ', err
-!
-      end
-!
-!
-      subroutine putdata2d(n1max, n1, n2, a)
-      integer n1max, n1, n2, j1, j2, seed
-      real*8 a(0 : n1max - 1, 0 : *), drnd
-      seed = 0
-      do j2 = 0, n2 - 1
-          do j1 = 0, n1 - 1
-              a(j1, j2) = drnd(seed)
-          end do
-      end do
-      end
-!
-!
-      function errorcheck2d(n1max, n1, n2, scale, a)
-      integer n1max, n1, n2, j1, j2, seed
-      real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e, 
-     &    errorcheck2d
-      err = 0
-      seed = 0
-      do j2 = 0, n2 - 1
-          do j1 = 0, n1 - 1
-              e = drnd(seed) - a(j1, j2) * scale
-              err = max(err, abs(e))
-          end do
-      end do
-      errorcheck2d = err
-      end
-!
-!
-! random number generator, 0 <= drnd < 1
-      real*8 function drnd(seed)
-      integer seed
-      seed = mod(seed * 7141 + 54773, 259200)
-      drnd = seed * (1.0d0 / 259200)
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.c b/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.c
deleted file mode 100644
index 87879e6..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/* test of fftsg3d.c */
-
-#include <math.h>
-#include <stdio.h>
-#include "alloc.h"
-#define MAX(x,y) ((x) > (y) ? (x) : (y))
-
-/* random number generator, 0 <= RND < 1 */
-#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
-
-
-int main()
-{
-    void cdft3d(int, int, int, int, double ***, double *, int *, double *);
-    void rdft3d(int, int, int, int, double ***, double *, int *, double *);
-    void ddct3d(int, int, int, int, double ***, double *, int *, double *);
-    void ddst3d(int, int, int, int, double ***, double *, int *, double *);
-    void putdata3d(int n1, int n2, int n3, double ***a);
-    double errorcheck3d(int n1, int n2, int n3, double scale, double ***a);
-    int *ip, n1, n2, n3, n, nt, i, j;
-    double ***a, *w, err;
-
-    printf("data length n1=? (n1 = power of 2) \n");
-    scanf("%d", &n1);
-    printf("data length n2=? (n2 = power of 2) \n");
-    scanf("%d", &n2);
-    printf("data length n3=? (n3 = power of 2) \n");
-    scanf("%d", &n3);
-
-    a = alloc_3d_double(n1, n2, n3);
-    nt = MAX(n1, n2);
-    n = MAX(nt, n3 / 2);
-    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
-    n = MAX(nt, n3) * 3 / 2;
-    w = alloc_1d_double(n);
-    ip[0] = 0;
-
-    /* check of CDFT */
-    putdata3d(n1, n2, n3, a);
-    cdft3d(n1, n2, n3, 1, a, NULL, ip, w);
-    cdft3d(n1, n2, n3, -1, a, NULL, ip, w);
-    err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
-    printf("cdft3d err= %g \n", err);
-
-    /* check of RDFT */
-    putdata3d(n1, n2, n3, a);
-    rdft3d(n1, n2, n3, 1, a, NULL, ip, w);
-    rdft3d(n1, n2, n3, -1, a, NULL, ip, w);
-    err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
-    printf("rdft3d err= %g \n", err);
-
-    /* check of DDCT */
-    putdata3d(n1, n2, n3, a);
-    ddct3d(n1, n2, n3, 1, a, NULL, ip, w);
-    ddct3d(n1, n2, n3, -1, a, NULL, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        for (j = 0; j <= n2 - 1; j++) {
-            a[i][j][0] *= 0.5;
-        }
-        for (j = 0; j <= n3 - 1; j++) {
-            a[i][0][j] *= 0.5;
-        }
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        for (j = 0; j <= n3 - 1; j++) {
-            a[0][i][j] *= 0.5;
-        }
-    }
-    err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
-    printf("ddct3d err= %g \n", err);
-
-    /* check of DDST */
-    putdata3d(n1, n2, n3, a);
-    ddst3d(n1, n2, n3, 1, a, NULL, ip, w);
-    ddst3d(n1, n2, n3, -1, a, NULL, ip, w);
-    for (i = 0; i <= n1 - 1; i++) {
-        for (j = 0; j <= n2 - 1; j++) {
-            a[i][j][0] *= 0.5;
-        }
-        for (j = 0; j <= n3 - 1; j++) {
-            a[i][0][j] *= 0.5;
-        }
-    }
-    for (i = 0; i <= n2 - 1; i++) {
-        for (j = 0; j <= n3 - 1; j++) {
-            a[0][i][j] *= 0.5;
-        }
-    }
-    err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
-    printf("ddst3d err= %g \n", err);
-
-    free_1d_double(w);
-    free_1d_int(ip);
-    free_3d_double(a);
-    return 0;
-}
-
-
-void putdata3d(int n1, int n2, int n3, double ***a)
-{
-    int j1, j2, j3, seed = 0;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            for (j3 = 0; j3 <= n3 - 1; j3++) {
-                a[j1][j2][j3] = RND(&seed);
-            }
-        }
-    }
-}
-
-
-double errorcheck3d(int n1, int n2, int n3, double scale, double ***a)
-{
-    int j1, j2, j3, seed = 0;
-    double err = 0, e;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            for (j3 = 0; j3 <= n3 - 1; j3++) {
-                e = RND(&seed) - a[j1][j2][j3] * scale;
-                err = MAX(err, fabs(e));
-            }
-        }
-    }
-    return err;
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.f b/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.f
deleted file mode 100644
index bbada0d..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/fftsg3dt.f
+++ /dev/null
@@ -1,119 +0,0 @@
-! test of fftsg3d.f
-!
-      program main
-      integer nmax, nmaxsqrt
-      parameter (nmax = 128)
-      parameter (nmaxsqrt = 16)
-      integer ip(0 : nmaxsqrt + 1), n1, n2, n3, i, j
-      real*8 a(0 : nmax - 1, 0 : nmax - 1, 0 : nmax - 1), 
-     &    t(0 : 8 * nmax - 1), 
-     &    w(0 : nmax * 3 / 2 - 1), err, errorcheck3d
-!
-      write (*, *) 'data length n1=? (n1 = power of 2) '
-      read (*, *) n1
-      write (*, *) 'data length n2=? (n2 = power of 2) '
-      read (*, *) n2
-      write (*, *) 'data length n3=? (n3 = power of 2) '
-      read (*, *) n3
-      ip(0) = 0
-!
-!   check of CDFT
-      call putdata3d(nmax, nmax, n1, n2, n3, a)
-      call cdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
-      call cdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
-      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
-     &    2.0d0 / n1 / n2 / n3, a)
-      write (*, *) 'cdft3d err= ', err
-!
-!   check of RDFT
-      call putdata3d(nmax, nmax, n1, n2, n3, a)
-      call rdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
-      call rdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
-      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
-     &    2.0d0 / n1 / n2 / n3, a)
-      write (*, *) 'rdft3d err= ', err
-!
-!   check of DDCT
-      call putdata3d(nmax, nmax, n1, n2, n3, a)
-      call ddct3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
-      call ddct3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
-      do j = 0, n2 - 1
-          do i = 0, n1 - 1
-              a(i, j, 0) = a(i, j, 0) * 0.5d0
-          end do
-      end do
-      do j = 0, n3 - 1
-          do i = 0, n1 - 1
-              a(i, 0, j) = a(i, 0, j) * 0.5d0
-          end do
-          do i = 0, n2 - 1
-              a(0, i, j) = a(0, i, j) * 0.5d0
-          end do
-      end do
-      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
-     &    8.0d0 / n1 / n2 / n3, a)
-      write (*, *) 'ddct3d err= ', err
-!
-!   check of DDST
-      call putdata3d(nmax, nmax, n1, n2, n3, a)
-      call ddst3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
-      call ddst3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
-      do j = 0, n2 - 1
-          do i = 0, n1 - 1
-              a(i, j, 0) = a(i, j, 0) * 0.5d0
-          end do
-      end do
-      do j = 0, n3 - 1
-          do i = 0, n1 - 1
-              a(i, 0, j) = a(i, 0, j) * 0.5d0
-          end do
-          do i = 0, n2 - 1
-              a(0, i, j) = a(0, i, j) * 0.5d0
-          end do
-      end do
-      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
-     &    8.0d0 / n1 / n2 / n3, a)
-      write (*, *) 'ddst3d err= ', err
-!
-      end
-!
-!
-      subroutine putdata3d(n1max, n2max, n1, n2, n3, a)
-      integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
-      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : *), drnd
-      seed = 0
-      do j3 = 0, n3 - 1
-          do j2 = 0, n2 - 1
-              do j1 = 0, n1 - 1
-                  a(j1, j2, j3) = drnd(seed)
-              end do
-          end do
-      end do
-      end
-!
-!
-      function errorcheck3d(n1max, n2max, n1, n2, n3, scale, a)
-      integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
-      real*8 scale, a(0 : n1max - 1, 0 : n2max - 1, 0 : *), 
-     &    drnd, err, e, errorcheck3d
-      err = 0
-      seed = 0
-      do j3 = 0, n3 - 1
-          do j2 = 0, n2 - 1
-              do j1 = 0, n1 - 1
-                  e = drnd(seed) - a(j1, j2, j3) * scale
-                  err = max(err, abs(e))
-              end do
-          end do
-      end do
-      errorcheck3d = err
-      end
-!
-!
-! random number generator, 0 <= drnd < 1
-      real*8 function drnd(seed)
-      integer seed
-      seed = mod(seed * 7141 + 54773, 259200)
-      drnd = seed * (1.0d0 / 259200)
-      end
-!
diff --git a/third_party/tensorflow_dependencies/fft2d/sample2d/shrtdctt.c b/third_party/tensorflow_dependencies/fft2d/sample2d/shrtdctt.c
deleted file mode 100644
index 70d665f..0000000
--- a/third_party/tensorflow_dependencies/fft2d/sample2d/shrtdctt.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/* test of shrtdct.c */
-
-#include <math.h>
-#include <stdio.h>
-#define MAX(x,y) ((x) > (y) ? (x) : (y))
-
-/* random number generator, 0 <= RND < 1 */
-#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
-
-#define NMAX 16
-
-int main()
-{
-    void ddct8x8s(int isgn, double **a);
-    void ddct16x16s(int isgn, double **a);
-    void putdata2d(int n1, int n2, double **a);
-    double errorcheck2d(int n1, int n2, double scale, double **a);
-    double err;
-    
-    int i;
-    double aarr[NMAX][NMAX], *a[NMAX], barr[NMAX][NMAX], *b[NMAX];
-    for (i = 0; i < NMAX; i++) a[i] = aarr[i];
-    for (i = 0; i < NMAX; i++) b[i] = barr[i];
-
-    /* check of 8x8 DCT */
-    putdata2d(8, 8, a);
-    ddct8x8s(-1, a);
-    ddct8x8s(1, a);
-    err = errorcheck2d(8, 8, 1.0, a);
-    printf("ddct8x8s   err= %g\n", err);
-
-    /* check of 16x16 DCT */
-    putdata2d(16, 16, a);
-    ddct16x16s(-1, a);
-    ddct16x16s(1, a);
-    err = errorcheck2d(16, 16, 1.0, a);
-    printf("ddct16x16s err= %g\n", err);
-
-    return 0;
-}
-
-
-void putdata2d(int n1, int n2, double **a)
-{
-    int j1, j2, seed = 0;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            a[j1][j2] = RND(&seed);
-        }
-    }
-}
-
-
-double errorcheck2d(int n1, int n2, double scale, double **a)
-{
-    int j1, j2, seed = 0;
-    double err = 0, e;
-
-    for (j1 = 0; j1 <= n1 - 1; j1++) {
-        for (j2 = 0; j2 <= n2 - 1; j2++) {
-            e = RND(&seed) - a[j1][j2] * scale;
-            err = MAX(err, fabs(e));
-        }
-    }
-    return err;
-}
-
diff --git a/third_party/tensorflow_dependencies/fft2d/shrtdct.c b/third_party/tensorflow_dependencies/fft2d/shrtdct.c
deleted file mode 100644
index 455cb4c..0000000
--- a/third_party/tensorflow_dependencies/fft2d/shrtdct.c
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
-Short Discrete Cosine Transform
-    data length :8x8, 16x16
-    method      :row-column, radix 4 FFT
-functions
-    ddct8x8s  : 8x8 DCT
-    ddct16x16s: 16x16 DCT
-function prototypes
-    void ddct8x8s(int isgn, double **a);
-    void ddct16x16s(int isgn, double **a);
-*/
-
-
-/*
--------- 8x8 DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> Normalized 8x8 IDCT
-            C[k1][k2] = (1/4) * sum_j1=0^7 sum_j2=0^7 
-                            a[j1][j2] * s[j1] * s[j2] * 
-                            cos(pi*j1*(k1+1/2)/8) * 
-                            cos(pi*j2*(k2+1/2)/8), 0<=k1<8, 0<=k2<8
-                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
-        <case2> Normalized 8x8 DCT
-            C[k1][k2] = (1/4) * s[k1] * s[k2] * sum_j1=0^7 sum_j2=0^7 
-                            a[j1][j2] * 
-                            cos(pi*(j1+1/2)*k1/8) * 
-                            cos(pi*(j2+1/2)*k2/8), 0<=k1<8, 0<=k2<8
-                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
-    [usage]
-        <case1>
-            ddct8x8s(1, a);
-        <case2>
-            ddct8x8s(-1, a);
-    [parameters]
-        a[0...7][0...7] :input/output data (double **)
-                         output data
-                             a[k1][k2] = C[k1][k2], 0<=k1<8, 0<=k2<8
-*/
-
-
-/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
-/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
-/* Wn_kR = cos(pi/2*k/n) */
-/* Wn_kI = sin(pi/2*k/n) */
-#define C8_1R   0.49039264020161522456
-#define C8_1I   0.09754516100806413392
-#define C8_2R   0.46193976625564337806
-#define C8_2I   0.19134171618254488586
-#define C8_3R   0.41573480615127261854
-#define C8_3I   0.27778511650980111237
-#define C8_4R   0.35355339059327376220
-#define W8_4R   0.70710678118654752440
-
-
-void ddct8x8s(int isgn, double **a)
-{
-    int j;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    double xr, xi;
-    
-    if (isgn < 0) {
-        for (j = 0; j <= 7; j++) {
-            x0r = a[0][j] + a[7][j];
-            x1r = a[0][j] - a[7][j];
-            x0i = a[2][j] + a[5][j];
-            x1i = a[2][j] - a[5][j];
-            x2r = a[4][j] + a[3][j];
-            x3r = a[4][j] - a[3][j];
-            x2i = a[6][j] + a[1][j];
-            x3i = a[6][j] - a[1][j];
-            xr = x0r + x2r;
-            xi = x0i + x2i;
-            a[0][j] = C8_4R * (xr + xi);
-            a[4][j] = C8_4R * (xr - xi);
-            xr = x0r - x2r;
-            xi = x0i - x2i;
-            a[2][j] = C8_2R * xr - C8_2I * xi;
-            a[6][j] = C8_2R * xi + C8_2I * xr;
-            xr = W8_4R * (x1i - x3i);
-            x1i = W8_4R * (x1i + x3i);
-            x3i = x1i - x3r;
-            x1i += x3r;
-            x3r = x1r - xr;
-            x1r += xr;
-            a[1][j] = C8_1R * x1r - C8_1I * x1i;
-            a[7][j] = C8_1R * x1i + C8_1I * x1r;
-            a[3][j] = C8_3R * x3r - C8_3I * x3i;
-            a[5][j] = C8_3R * x3i + C8_3I * x3r;
-        }
-        for (j = 0; j <= 7; j++) {
-            x0r = a[j][0] + a[j][7];
-            x1r = a[j][0] - a[j][7];
-            x0i = a[j][2] + a[j][5];
-            x1i = a[j][2] - a[j][5];
-            x2r = a[j][4] + a[j][3];
-            x3r = a[j][4] - a[j][3];
-            x2i = a[j][6] + a[j][1];
-            x3i = a[j][6] - a[j][1];
-            xr = x0r + x2r;
-            xi = x0i + x2i;
-            a[j][0] = C8_4R * (xr + xi);
-            a[j][4] = C8_4R * (xr - xi);
-            xr = x0r - x2r;
-            xi = x0i - x2i;
-            a[j][2] = C8_2R * xr - C8_2I * xi;
-            a[j][6] = C8_2R * xi + C8_2I * xr;
-            xr = W8_4R * (x1i - x3i);
-            x1i = W8_4R * (x1i + x3i);
-            x3i = x1i - x3r;
-            x1i += x3r;
-            x3r = x1r - xr;
-            x1r += xr;
-            a[j][1] = C8_1R * x1r - C8_1I * x1i;
-            a[j][7] = C8_1R * x1i + C8_1I * x1r;
-            a[j][3] = C8_3R * x3r - C8_3I * x3i;
-            a[j][5] = C8_3R * x3i + C8_3I * x3r;
-        }
-    } else {
-        for (j = 0; j <= 7; j++) {
-            x1r = C8_1R * a[1][j] + C8_1I * a[7][j];
-            x1i = C8_1R * a[7][j] - C8_1I * a[1][j];
-            x3r = C8_3R * a[3][j] + C8_3I * a[5][j];
-            x3i = C8_3R * a[5][j] - C8_3I * a[3][j];
-            xr = x1r - x3r;
-            xi = x1i + x3i;
-            x1r += x3r;
-            x3i -= x1i;
-            x1i = W8_4R * (xr + xi);
-            x3r = W8_4R * (xr - xi);
-            xr = C8_2R * a[2][j] + C8_2I * a[6][j];
-            xi = C8_2R * a[6][j] - C8_2I * a[2][j];
-            x0r = C8_4R * (a[0][j] + a[4][j]);
-            x0i = C8_4R * (a[0][j] - a[4][j]);
-            x2r = x0r - xr;
-            x2i = x0i - xi;
-            x0r += xr;
-            x0i += xi;
-            a[0][j] = x0r + x1r;
-            a[7][j] = x0r - x1r;
-            a[2][j] = x0i + x1i;
-            a[5][j] = x0i - x1i;
-            a[4][j] = x2r - x3i;
-            a[3][j] = x2r + x3i;
-            a[6][j] = x2i - x3r;
-            a[1][j] = x2i + x3r;
-        }
-        for (j = 0; j <= 7; j++) {
-            x1r = C8_1R * a[j][1] + C8_1I * a[j][7];
-            x1i = C8_1R * a[j][7] - C8_1I * a[j][1];
-            x3r = C8_3R * a[j][3] + C8_3I * a[j][5];
-            x3i = C8_3R * a[j][5] - C8_3I * a[j][3];
-            xr = x1r - x3r;
-            xi = x1i + x3i;
-            x1r += x3r;
-            x3i -= x1i;
-            x1i = W8_4R * (xr + xi);
-            x3r = W8_4R * (xr - xi);
-            xr = C8_2R * a[j][2] + C8_2I * a[j][6];
-            xi = C8_2R * a[j][6] - C8_2I * a[j][2];
-            x0r = C8_4R * (a[j][0] + a[j][4]);
-            x0i = C8_4R * (a[j][0] - a[j][4]);
-            x2r = x0r - xr;
-            x2i = x0i - xi;
-            x0r += xr;
-            x0i += xi;
-            a[j][0] = x0r + x1r;
-            a[j][7] = x0r - x1r;
-            a[j][2] = x0i + x1i;
-            a[j][5] = x0i - x1i;
-            a[j][4] = x2r - x3i;
-            a[j][3] = x2r + x3i;
-            a[j][6] = x2i - x3r;
-            a[j][1] = x2i + x3r;
-        }
-    }
-}
-
-
-
-/*
--------- 16x16 DCT (Discrete Cosine Transform) / Inverse of DCT --------
-    [definition]
-        <case1> Normalized 16x16 IDCT
-            C[k1][k2] = (1/8) * sum_j1=0^15 sum_j2=0^15 
-                            a[j1][j2] * s[j1] * s[j2] * 
-                            cos(pi*j1*(k1+1/2)/16) * 
-                            cos(pi*j2*(k2+1/2)/16), 0<=k1<16, 0<=k2<16
-                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
-        <case2> Normalized 16x16 DCT
-            C[k1][k2] = (1/8) * s[k1] * s[k2] * sum_j1=0^15 sum_j2=0^15 
-                            a[j1][j2] * 
-                            cos(pi*(j1+1/2)*k1/16) * 
-                            cos(pi*(j2+1/2)*k2/16), 0<=k1<16, 0<=k2<16
-                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
-    [usage]
-        <case1>
-            ddct16x16s(1, a);
-        <case2>
-            ddct16x16s(-1, a);
-    [parameters]
-        a[0...15][0...15] :input/output data (double **)
-                           output data
-                               a[k1][k2] = C[k1][k2], 0<=k1<16, 0<=k2<16
-*/
-
-
-/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
-/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
-/* Wn_kR = cos(pi/2*k/n) */
-/* Wn_kI = sin(pi/2*k/n) */
-#define C16_1R   0.35185093438159561476
-#define C16_1I   0.03465429229977286565
-#define C16_2R   0.34675996133053686546
-#define C16_2I   0.06897484482073575308
-#define C16_3R   0.33832950029358816957
-#define C16_3I   0.10263113188058934529
-#define C16_4R   0.32664074121909413196
-#define C16_4I   0.13529902503654924610
-#define C16_5R   0.31180625324666780814
-#define C16_5I   0.16666391461943662432
-#define C16_6R   0.29396890060483967924
-#define C16_6I   0.19642373959677554532
-#define C16_7R   0.27330046675043937206
-#define C16_7I   0.22429189658565907106
-#define C16_8R   0.25
-#define W16_4R   0.92387953251128675613
-#define W16_4I   0.38268343236508977173
-#define W16_8R   0.70710678118654752440
-
-
-void ddct16x16s(int isgn, double **a)
-{
-    int j;
-    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-    double x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i;
-    double xr, xi;
-    
-    if (isgn < 0) {
-        for (j = 0; j <= 15; j++) {
-            x4r = a[0][j] - a[15][j];
-            xr = a[0][j] + a[15][j];
-            x4i = a[8][j] - a[7][j];
-            xi = a[8][j] + a[7][j];
-            x0r = xr + xi;
-            x0i = xr - xi;
-            x5r = a[2][j] - a[13][j];
-            xr = a[2][j] + a[13][j];
-            x5i = a[10][j] - a[5][j];
-            xi = a[10][j] + a[5][j];
-            x1r = xr + xi;
-            x1i = xr - xi;
-            x6r = a[4][j] - a[11][j];
-            xr = a[4][j] + a[11][j];
-            x6i = a[12][j] - a[3][j];
-            xi = a[12][j] + a[3][j];
-            x2r = xr + xi;
-            x2i = xr - xi;
-            x7r = a[6][j] - a[9][j];
-            xr = a[6][j] + a[9][j];
-            x7i = a[14][j] - a[1][j];
-            xi = a[14][j] + a[1][j];
-            x3r = xr + xi;
-            x3i = xr - xi;
-            xr = x0r + x2r;
-            xi = x1r + x3r;
-            a[0][j] = C16_8R * (xr + xi);
-            a[8][j] = C16_8R * (xr - xi);
-            xr = x0r - x2r;
-            xi = x1r - x3r;
-            a[4][j] = C16_4R * xr - C16_4I * xi;
-            a[12][j] = C16_4R * xi + C16_4I * xr;
-            x0r = W16_8R * (x1i - x3i);
-            x2r = W16_8R * (x1i + x3i);
-            xr = x0i + x0r;
-            xi = x2r + x2i;
-            a[2][j] = C16_2R * xr - C16_2I * xi;
-            a[14][j] = C16_2R * xi + C16_2I * xr;
-            xr = x0i - x0r;
-            xi = x2r - x2i;
-            a[6][j] = C16_6R * xr - C16_6I * xi;
-            a[10][j] = C16_6R * xi + C16_6I * xr;
-            xr = W16_8R * (x6r - x6i);
-            xi = W16_8R * (x6i + x6r);
-            x6r = x4r - xr;
-            x6i = x4i - xi;
-            x4r += xr;
-            x4i += xi;
-            xr = W16_4I * x7r - W16_4R * x7i;
-            xi = W16_4I * x7i + W16_4R * x7r;
-            x7r = W16_4R * x5r - W16_4I * x5i;
-            x7i = W16_4R * x5i + W16_4I * x5r;
-            x5r = x7r + xr;
-            x5i = x7i + xi;
-            x7r -= xr;
-            x7i -= xi;
-            xr = x4r + x5r;
-            xi = x5i + x4i;
-            a[1][j] = C16_1R * xr - C16_1I * xi;
-            a[15][j] = C16_1R * xi + C16_1I * xr;
-            xr = x4r - x5r;
-            xi = x5i - x4i;
-            a[7][j] = C16_7R * xr - C16_7I * xi;
-            a[9][j] = C16_7R * xi + C16_7I * xr;
-            xr = x6r - x7i;
-            xi = x7r + x6i;
-            a[5][j] = C16_5R * xr - C16_5I * xi;
-            a[11][j] = C16_5R * xi + C16_5I * xr;
-            xr = x6r + x7i;
-            xi = x7r - x6i;
-            a[3][j] = C16_3R * xr - C16_3I * xi;
-            a[13][j] = C16_3R * xi + C16_3I * xr;
-        }
-        for (j = 0; j <= 15; j++) {
-            x4r = a[j][0] - a[j][15];
-            xr = a[j][0] + a[j][15];
-            x4i = a[j][8] - a[j][7];
-            xi = a[j][8] + a[j][7];
-            x0r = xr + xi;
-            x0i = xr - xi;
-            x5r = a[j][2] - a[j][13];
-            xr = a[j][2] + a[j][13];
-            x5i = a[j][10] - a[j][5];
-            xi = a[j][10] + a[j][5];
-            x1r = xr + xi;
-            x1i = xr - xi;
-            x6r = a[j][4] - a[j][11];
-            xr = a[j][4] + a[j][11];
-            x6i = a[j][12] - a[j][3];
-            xi = a[j][12] + a[j][3];
-            x2r = xr + xi;
-            x2i = xr - xi;
-            x7r = a[j][6] - a[j][9];
-            xr = a[j][6] + a[j][9];
-            x7i = a[j][14] - a[j][1];
-            xi = a[j][14] + a[j][1];
-            x3r = xr + xi;
-            x3i = xr - xi;
-            xr = x0r + x2r;
-            xi = x1r + x3r;
-            a[j][0] = C16_8R * (xr + xi);
-            a[j][8] = C16_8R * (xr - xi);
-            xr = x0r - x2r;
-            xi = x1r - x3r;
-            a[j][4] = C16_4R * xr - C16_4I * xi;
-            a[j][12] = C16_4R * xi + C16_4I * xr;
-            x0r = W16_8R * (x1i - x3i);
-            x2r = W16_8R * (x1i + x3i);
-            xr = x0i + x0r;
-            xi = x2r + x2i;
-            a[j][2] = C16_2R * xr - C16_2I * xi;
-            a[j][14] = C16_2R * xi + C16_2I * xr;
-            xr = x0i - x0r;
-            xi = x2r - x2i;
-            a[j][6] = C16_6R * xr - C16_6I * xi;
-            a[j][10] = C16_6R * xi + C16_6I * xr;
-            xr = W16_8R * (x6r - x6i);
-            xi = W16_8R * (x6i + x6r);
-            x6r = x4r - xr;
-            x6i = x4i - xi;
-            x4r += xr;
-            x4i += xi;
-            xr = W16_4I * x7r - W16_4R * x7i;
-            xi = W16_4I * x7i + W16_4R * x7r;
-            x7r = W16_4R * x5r - W16_4I * x5i;
-            x7i = W16_4R * x5i + W16_4I * x5r;
-            x5r = x7r + xr;
-            x5i = x7i + xi;
-            x7r -= xr;
-            x7i -= xi;
-            xr = x4r + x5r;
-            xi = x5i + x4i;
-            a[j][1] = C16_1R * xr - C16_1I * xi;
-            a[j][15] = C16_1R * xi + C16_1I * xr;
-            xr = x4r - x5r;
-            xi = x5i - x4i;
-            a[j][7] = C16_7R * xr - C16_7I * xi;
-            a[j][9] = C16_7R * xi + C16_7I * xr;
-            xr = x6r - x7i;
-            xi = x7r + x6i;
-            a[j][5] = C16_5R * xr - C16_5I * xi;
-            a[j][11] = C16_5R * xi + C16_5I * xr;
-            xr = x6r + x7i;
-            xi = x7r - x6i;
-            a[j][3] = C16_3R * xr - C16_3I * xi;
-            a[j][13] = C16_3R * xi + C16_3I * xr;
-        }
-    } else {
-        for (j = 0; j <= 15; j++) {
-            x5r = C16_1R * a[1][j] + C16_1I * a[15][j];
-            x5i = C16_1R * a[15][j] - C16_1I * a[1][j];
-            xr = C16_7R * a[7][j] + C16_7I * a[9][j];
-            xi = C16_7R * a[9][j] - C16_7I * a[7][j];
-            x4r = x5r + xr;
-            x4i = x5i - xi;
-            x5r -= xr;
-            x5i += xi;
-            x7r = C16_5R * a[5][j] + C16_5I * a[11][j];
-            x7i = C16_5R * a[11][j] - C16_5I * a[5][j];
-            xr = C16_3R * a[3][j] + C16_3I * a[13][j];
-            xi = C16_3R * a[13][j] - C16_3I * a[3][j];
-            x6r = x7r + xr;
-            x6i = x7i - xi;
-            x7r -= xr;
-            x7i += xi;
-            xr = x4r - x6r;
-            xi = x4i - x6i;
-            x4r += x6r;
-            x4i += x6i;
-            x6r = W16_8R * (xi + xr);
-            x6i = W16_8R * (xi - xr);
-            xr = x5r + x7i;
-            xi = x5i - x7r;
-            x5r -= x7i;
-            x5i += x7r;
-            x7r = W16_4I * x5r + W16_4R * x5i;
-            x7i = W16_4I * x5i - W16_4R * x5r;
-            x5r = W16_4R * xr + W16_4I * xi;
-            x5i = W16_4R * xi - W16_4I * xr;
-            xr = C16_4R * a[4][j] + C16_4I * a[12][j];
-            xi = C16_4R * a[12][j] - C16_4I * a[4][j];
-            x2r = C16_8R * (a[0][j] + a[8][j]);
-            x3r = C16_8R * (a[0][j] - a[8][j]);
-            x0r = x2r + xr;
-            x1r = x3r + xi;
-            x2r -= xr;
-            x3r -= xi;
-            x0i = C16_2R * a[2][j] + C16_2I * a[14][j];
-            x2i = C16_2R * a[14][j] - C16_2I * a[2][j];
-            x1i = C16_6R * a[6][j] + C16_6I * a[10][j];
-            x3i = C16_6R * a[10][j] - C16_6I * a[6][j];
-            xr = x0i - x1i;
-            xi = x2i + x3i;
-            x0i += x1i;
-            x2i -= x3i;
-            x1i = W16_8R * (xi + xr);
-            x3i = W16_8R * (xi - xr);
-            xr = x0r + x0i;
-            xi = x0r - x0i;
-            a[0][j] = xr + x4r;
-            a[15][j] = xr - x4r;
-            a[8][j] = xi + x4i;
-            a[7][j] = xi - x4i;
-            xr = x1r + x1i;
-            xi = x1r - x1i;
-            a[2][j] = xr + x5r;
-            a[13][j] = xr - x5r;
-            a[10][j] = xi + x5i;
-            a[5][j] = xi - x5i;
-            xr = x2r + x2i;
-            xi = x2r - x2i;
-            a[4][j] = xr + x6r;
-            a[11][j] = xr - x6r;
-            a[12][j] = xi + x6i;
-            a[3][j] = xi - x6i;
-            xr = x3r + x3i;
-            xi = x3r - x3i;
-            a[6][j] = xr + x7r;
-            a[9][j] = xr - x7r;
-            a[14][j] = xi + x7i;
-            a[1][j] = xi - x7i;
-        }
-        for (j = 0; j <= 15; j++) {
-            x5r = C16_1R * a[j][1] + C16_1I * a[j][15];
-            x5i = C16_1R * a[j][15] - C16_1I * a[j][1];
-            xr = C16_7R * a[j][7] + C16_7I * a[j][9];
-            xi = C16_7R * a[j][9] - C16_7I * a[j][7];
-            x4r = x5r + xr;
-            x4i = x5i - xi;
-            x5r -= xr;
-            x5i += xi;
-            x7r = C16_5R * a[j][5] + C16_5I * a[j][11];
-            x7i = C16_5R * a[j][11] - C16_5I * a[j][5];
-            xr = C16_3R * a[j][3] + C16_3I * a[j][13];
-            xi = C16_3R * a[j][13] - C16_3I * a[j][3];
-            x6r = x7r + xr;
-            x6i = x7i - xi;
-            x7r -= xr;
-            x7i += xi;
-            xr = x4r - x6r;
-            xi = x4i - x6i;
-            x4r += x6r;
-            x4i += x6i;
-            x6r = W16_8R * (xi + xr);
-            x6i = W16_8R * (xi - xr);
-            xr = x5r + x7i;
-            xi = x5i - x7r;
-            x5r -= x7i;
-            x5i += x7r;
-            x7r = W16_4I * x5r + W16_4R * x5i;
-            x7i = W16_4I * x5i - W16_4R * x5r;
-            x5r = W16_4R * xr + W16_4I * xi;
-            x5i = W16_4R * xi - W16_4I * xr;
-            xr = C16_4R * a[j][4] + C16_4I * a[j][12];
-            xi = C16_4R * a[j][12] - C16_4I * a[j][4];
-            x2r = C16_8R * (a[j][0] + a[j][8]);
-            x3r = C16_8R * (a[j][0] - a[j][8]);
-            x0r = x2r + xr;
-            x1r = x3r + xi;
-            x2r -= xr;
-            x3r -= xi;
-            x0i = C16_2R * a[j][2] + C16_2I * a[j][14];
-            x2i = C16_2R * a[j][14] - C16_2I * a[j][2];
-            x1i = C16_6R * a[j][6] + C16_6I * a[j][10];
-            x3i = C16_6R * a[j][10] - C16_6I * a[j][6];
-            xr = x0i - x1i;
-            xi = x2i + x3i;
-            x0i += x1i;
-            x2i -= x3i;
-            x1i = W16_8R * (xi + xr);
-            x3i = W16_8R * (xi - xr);
-            xr = x0r + x0i;
-            xi = x0r - x0i;
-            a[j][0] = xr + x4r;
-            a[j][15] = xr - x4r;
-            a[j][8] = xi + x4i;
-            a[j][7] = xi - x4i;
-            xr = x1r + x1i;
-            xi = x1r - x1i;
-            a[j][2] = xr + x5r;
-            a[j][13] = xr - x5r;
-            a[j][10] = xi + x5i;
-            a[j][5] = xi - x5i;
-            xr = x2r + x2i;
-            xi = x2r - x2i;
-            a[j][4] = xr + x6r;
-            a[j][11] = xr - x6r;
-            a[j][12] = xi + x6i;
-            a[j][3] = xi - x6i;
-            xr = x3r + x3i;
-            xi = x3r - x3i;
-            a[j][6] = xr + x7r;
-            a[j][9] = xr - x7r;
-            a[j][14] = xi + x7i;
-            a[j][1] = xi - x7i;
-        }
-    }
-}
-
diff --git a/third_party/tensorflow_dependencies/flatbuffers b/third_party/tensorflow_dependencies/flatbuffers
deleted file mode 160000
index 6d0aae7..0000000
--- a/third_party/tensorflow_dependencies/flatbuffers
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6d0aae73cd2ad3f8ee93c952af6b33e99049ad9c
diff --git a/third_party/tensorflow_dependencies/fp16 b/third_party/tensorflow_dependencies/fp16
deleted file mode 160000
index 4dfe081..0000000
--- a/third_party/tensorflow_dependencies/fp16
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4dfe081cf6bcd15db339cf2680b9281b8451eeb3
diff --git a/third_party/tensorflow_dependencies/gemmlowp b/third_party/tensorflow_dependencies/gemmlowp
deleted file mode 160000
index fda83bd..0000000
--- a/third_party/tensorflow_dependencies/gemmlowp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit fda83bdc38b118cc6b56753bd540caa49e570745
diff --git a/third_party/tensorflow_dependencies/neon_2_sse b/third_party/tensorflow_dependencies/neon_2_sse
deleted file mode 160000
index 42b2beb..0000000
--- a/third_party/tensorflow_dependencies/neon_2_sse
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 42b2bebacee25452e150095ef4480b3fa26e30f5
diff --git a/third_party/tensorflow_dependencies/ruy b/third_party/tensorflow_dependencies/ruy
deleted file mode 160000
index bebf022..0000000
--- a/third_party/tensorflow_dependencies/ruy
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bebf022784e9b22277b84373c9877aebff8411a7