// av1/common/idct.c - aom - Git at Google

 /*
  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <math.h>

 #include "./aom_dsp_rtcd.h"
 #include "./av1_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
 #include "aom_ports/mem.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"

 // Returns 0, 1 or 2 depending on how many pels the transform size covers:
 // one step once it exceeds 256 pels and a second step once it exceeds 1024.
 int av1_get_tx_scale(const TX_SIZE tx_size) {
   // Largest possible pel count is 4096 (64x64).
   const int num_pels = tx_size_2d[tx_size];
   int scale = 0;
   if (num_pels > 256) ++scale;
   if (num_pels > 1024) ++scale;
   return scale;
 }

 // NOTE: The implementations of all inverses need to be aware of the fact
 // that input and output could be the same buffer.

 // Multiplies each of the four input coefficients by Sqrt2 and passes the
 // product through dct_const_round_shift(). Element i of the output depends
 // only on element i of the input, so input and output may be the same buffer.
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int idx = 0;
   while (idx < 4) {
     output[idx] = (tran_low_t)dct_const_round_shift(input[idx] * Sqrt2);
     ++idx;
   }
 }

 // Turns a top-to-bottom strided walk into a bottom-to-top one: `dest` is
 // advanced by (size - 1) strides and `stride` is negated. Both `dest` and
 // `stride` must be modifiable lvalues; `dest` and `stride` are each expanded
 // more than once, so do not pass expressions with side effects.
 #define FLIPUD_PTR(dest, stride, size)       \
   do {                                       \
     (dest) = (dest) + ((size)-1) * (stride); \
     (stride) = -(stride);                    \
   } while (0)

 // Adjusts the destination and/or source pointer-stride pairs in place so
 // that the later "add transposed src to dst" loop produces the flips that
 // tx_type requires. Transform types without a FLIPADST component leave all
 // four values untouched.
 static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
                                int *sstride, TX_TYPE tx_type, int sizey,
                                int sizex) {
   // Note that the transpose of src will be added to dst. In order to LR
   // flip the addends (in dst coordinates), we UD flip the src. To UD flip
   // the addends, we UD flip the dst.
   int flip_ud = 0;
   int flip_lr = 0;

   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case IDTX:
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST: break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: flip_ud = 1; break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST: flip_lr = 1; break;
     case FLIPADST_FLIPADST:
       flip_ud = 1;
       flip_lr = 1;
       break;
     default: assert(0); break;
   }

   if (flip_ud) {
     // flip UD
     FLIPUD_PTR(*dst, *dstride, sizey);
   }
   if (flip_lr) {
     // flip LR
     FLIPUD_PTR(*src, *sstride, sizex);
   }
 }

 // Adds a bsx-wide, bsy-high block of coefficients, each right-shifted by a
 // size-dependent amount, into the 16-bit destination using
 // highbd_clip_pixel_add(). Does nothing unless tx_type is IDTX.
 //   input  - coefficients, bsx per row, rows stored back to back
 //   dest8  - destination handle, converted with CONVERT_TO_SHORTPTR
 //   stride - destination row stride in 16-bit samples
 //   bd     - bit depth forwarded to highbd_clip_pixel_add()
 static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bsx, int bsy, TX_TYPE tx_type,
                                   int bd) {
   if (tx_type != IDTX) return;

   // The shift is 3 for blocks of up to 256 pels, 2 up to 1024 pels, else 1.
   const int pels = bsx * bsy;
   int scale = 0;
   if (pels > 256) ++scale;
   if (pels > 1024) ++scale;
   const int shift = 3 - scale;

   uint16_t *row = CONVERT_TO_SHORTPTR(dest8);
   for (int r = 0; r < bsy; ++r, row += stride, input += bsx) {
     for (int c = 0; c < bsx; ++c) {
       row[c] = highbd_clip_pixel_add(row[c], input[c] >> shift, bd);
     }
   }
 }

 // 4x4 inverse hybrid transform followed by an add into `dest`.
 //   input      - 16 coefficients, four per row
 //   dest       - 8-bit destination block, updated in place
 //   stride     - destination row stride
 //   txfm_param - only tx_type is read here
 // DCT_DCT is forwarded to aom_idct4x4_16_add(); every other type runs two
 // 1-D passes selected from the table below.
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   if (tx_type == DCT_DCT) {
     aom_idct4x4_16_add(input, dest, stride);
     return;
   }
   // 1-D kernel pairs, indexed by TX_TYPE.
   static const transform_2d IHT_4[] = {
     { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
     { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
     { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
     { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
     { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
     { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
     { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
     { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
     { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
     { iidtx4_c, iidtx4_c },          // IDTX
     { aom_idct4_c, iidtx4_c },       // V_DCT
     { iidtx4_c, aom_idct4_c },       // H_DCT
     { aom_iadst4_c, iidtx4_c },      // V_ADST
     { iidtx4_c, aom_iadst4_c },      // H_ADST
     { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
     { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
   };

   const transform_2d *const kernels = &IHT_4[tx_type];
   tran_low_t transposed[4][4];
   tran_low_t out[4][4];
   tran_low_t *outp = &out[0][0];
   int outstride = 4;

   // First pass: the `rows` kernel on each group of four input coefficients.
   for (int r = 0; r < 4; ++r) {
     kernels->rows(input + 4 * r, out[r]);
   }

   // Transpose so the second pass also reads contiguous vectors.
   for (int r = 0; r < 4; ++r) {
     for (int c = 0; c < 4; ++c) {
       transposed[c][r] = out[r][c];
     }
   }

   // Second pass: the `cols` kernel on each transposed vector.
   for (int c = 0; c < 4; ++c) {
     kernels->cols(transposed[c], out[c]);
   }

   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);

   // `out` is still transposed, so destination (r, c) takes source (c, r).
   for (int r = 0; r < 4; ++r) {
     uint8_t *const dst_row = dest + r * stride;
     for (int c = 0; c < 4; ++c) {
       const tran_low_t residual = outp[c * outstride + r];
       dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 4));
     }
   }
 }

 // idct
 // Picks between the two aom_highbd_iwht4x4 entry points based on eob: the
 // `_1_add` variant when eob is at most 1, the `_16_add` variant otherwise.
 static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
                                int stride, int eob, int bd) {
   if (eob <= 1) {
     aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
     return;
   }
   aom_highbd_iwht4x4_16_add(input, dest, stride, bd);
 }

 // Reinterprets a tran_low_t coefficient buffer as int32_t. The assert
 // documents the requirement that both types have the same size.
 static const int32_t *cast_to_int32(const tran_low_t *input) {
   const int32_t *const as_int32 = (const int32_t *)input;
   assert(sizeof(tran_low_t) == sizeof(int32_t));
   return as_int32;
 }

 // 4x4 inverse transform + add on the high-bitdepth path.
 //   input      - coefficients
 //   dest       - destination handle (converted with CONVERT_TO_SHORTPTR)
 //   stride     - destination stride
 //   txfm_param - supplies tx_type, eob, bd, lossless and tx_set_type
 // Lossless blocks go through highbd_iwht4x4_add(); all others go to one of
 // the av1_inv_txfm2d_add_4x4 entry points depending on tx_type.
 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int bd = txfm_param->bd;
   const int32_t *src = cast_to_int32(input);
   const TX_TYPE tx_type = txfm_param->tx_type;

   if (txfm_param->lossless) {
     assert(tx_type == DCT_DCT);
     highbd_iwht4x4_add(input, dest, stride, txfm_param->eob, bd);
     return;
   }

   switch (tx_type) {
     // DCT/ADST combinations, with or without flipping, share one call.
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case FLIPADST_DCT:
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
       av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                              bd);
       break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
     default: assert(0); break;
   }
 }

 static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
                                     int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                            txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
                                     int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                            txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                             txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
   av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->tx_type, txfm_param->bd);
 }

 // 8x8 inverse transform + add. DCT/ADST/FLIPADST combinations go through
 // av1_inv_txfm2d_add_8x8(); types with an identity pass use the `_c` variant.
 static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                     int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);

   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case FLIPADST_DCT:
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
       av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                              bd);
       break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
     default: assert(0);
   }
 }

 // 16x16 inverse transform + add. DCT/ADST/FLIPADST combinations go through
 // av1_inv_txfm2d_add_16x16(); types with an identity pass use the `_c`
 // variant.
 static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);

   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case FLIPADST_DCT:
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
       av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;
     default: assert(0);
   }
 }

 // 32x32 inverse transform + add. Only DCT_DCT uses
 // av1_inv_txfm2d_add_32x32(); every other known type uses the `_c` variant.
 static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);

   if (tx_type == DCT_DCT) {
     av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                              bd);
     return;
   }

   switch (tx_type) {
     // The optimised version only supports DCT_DCT, so force use of
     // the C version for all other transform types.
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case FLIPADST_DCT:
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case IDTX:
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
       av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;

     default: assert(0);
   }
 }

 // 64x64 inverse transform + add. DCT_DCT uses av1_inv_txfm2d_add_64x64(),
 // IDTX uses highbd_inv_idtx_add_c(), and every other known type is run as
 // DCT_DCT through the `_c` variant (see the TODO below).
 static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);

   if (tx_type == DCT_DCT) {
     av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, DCT_DCT,
                              bd);
     return;
   }
   if (tx_type == IDTX) {
     highbd_inv_idtx_add_c(input, dest, stride, 64, 64, tx_type, bd);
     return;
   }

   switch (tx_type) {
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     case FLIPADST_DCT:
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
       // TODO(sarahparker)
       // I've deleted the 64x64 implementations that existed in lieu
       // of adst, flipadst and identity for simplicity but will bring back
       // in a later change. This shouldn't impact performance since
       // DCT_DCT is the only extended type currently allowed for 64x64,
       // as dictated by get_ext_tx_set_type in blockd.h.
       av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  DCT_DCT, bd);
       break;
     default: assert(0); break;
   }
 }

 // Fills *txfm_param for one transform block. tx_type, tx_size and eob are
 // copied from the arguments; lossless, bd and is_hbd are read from xd; and
 // tx_set_type is derived from the transform size, the plane's block size,
 // whether the block is inter coded, and reduced_tx_set.
 static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
                             TX_TYPE tx_type, int eob, int reduced_tx_set,
                             TxfmParam *txfm_param) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(xd->mi[0]->mbmi.sb_type, pd);
   const int is_inter = is_inter_block(&xd->mi[0]->mbmi);

   txfm_param->tx_type = tx_type;
   txfm_param->tx_size = tx_size;
   txfm_param->eob = eob;
   txfm_param->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
   txfm_param->bd = xd->bd;
   txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
   txfm_param->tx_set_type = get_ext_tx_set_type(
       txfm_param->tx_size, plane_bsize, is_inter, reduced_tx_set);
 }

 static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
                                 int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
     case TX_32X32:
       highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
       break;
     case TX_16X16:
       highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
       break;
     case TX_8X8:
       highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
       break;
     case TX_4X8:
       highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
       highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
       break;
     case TX_8X16:
       highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
       break;
     case TX_16X8:
       highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
       break;
     case TX_16X32:
       highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
       break;
     case TX_32X16:
       highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
       break;
     case TX_64X64:
       highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
       break;
     case TX_32X64:
       highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
       break;
     case TX_64X32:
       highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
       break;
     case TX_16X64:
       highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
       break;
     case TX_64X16:
       highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
       break;
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
       av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
       break;
     case TX_16X4:
       highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
       break;
     case TX_4X16:
       highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
       break;
     case TX_8X32:
       highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
       break;
     case TX_32X8:
       highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
       break;
     default: assert(0 && "Invalid transform size"); break;
   }
 }

 // Inverse transform + add for an 8-bit destination, implemented on top of
 // highbd_inv_txfm_add(): the w x h destination block is copied into a
 // 16-bit scratch buffer, transformed there, and copied back with a cast to
 // uint8_t.
 //   dqcoeff    - coefficients
 //   dst/stride - 8-bit destination block and its row stride
 //   txfm_param - tx_size selects the block dimensions; the whole struct is
 //                forwarded to highbd_inv_txfm_add()
 void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                         const TxfmParam *txfm_param) {
   const TX_SIZE tx_size = txfm_param->tx_size;
   DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
   int tmp_stride = MAX_TX_SIZE;
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];

   // Widen the current destination pixels into the scratch buffer.
   for (int r = 0; r < height; ++r) {
     const uint8_t *const dst_row = dst + r * stride;
     uint16_t *const tmp_row = tmp + r * tmp_stride;
     for (int c = 0; c < width; ++c) {
       tmp_row[c] = dst_row[c];
     }
   }

   highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);

   // Narrow the reconstructed samples back into the destination.
   for (int r = 0; r < height; ++r) {
     uint8_t *const dst_row = dst + r * stride;
     const uint16_t *const tmp_row = tmp + r * tmp_stride;
     for (int c = 0; c < width; ++c) {
       dst_row[c] = (uint8_t)tmp_row[c];
     }
   }
 }

 // Inverse transforms one block of dequantised coefficients and adds the
 // result into dst. Returns immediately when eob is zero. Otherwise the
 // transform parameters are built with init_txfm_param() and the block is
 // routed to highbd_inv_txfm_add() when is_hbd is set, or to
 // av1_inv_txfm_add() when it is not.
 void av1_inverse_transform_block(const MACROBLOCKD *xd,
                                  const tran_low_t *dqcoeff, int plane,
                                  TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
                                  int stride, int eob, int reduced_tx_set) {
   if (eob == 0) return;

   assert(eob <= av1_get_max_eob(tx_size));

   TxfmParam txfm_param;
   init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
                   &txfm_param);
   assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);

   if (!txfm_param.is_hbd) {
     av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
     return;
   }
   highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
 }
	/*
	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include <math.h>

	#include "./aom_dsp_rtcd.h"
	#include "./av1_rtcd.h"
	#include "aom_dsp/inv_txfm.h"
	#include "aom_ports/mem.h"
	#include "av1/common/av1_inv_txfm1d_cfg.h"
	#include "av1/common/blockd.h"
	#include "av1/common/enums.h"
	#include "av1/common/idct.h"

	// Returns 0, 1 or 2 depending on how many pels the transform size covers:
	// one step once it exceeds 256 pels and a second step once it exceeds 1024.
	int av1_get_tx_scale(const TX_SIZE tx_size) {
	  // Largest possible pel count is 4096 (64x64).
	  const int num_pels = tx_size_2d[tx_size];
	  int scale = 0;
	  if (num_pels > 256) ++scale;
	  if (num_pels > 1024) ++scale;
	  return scale;
	}

	// NOTE: The implementations of all inverses need to be aware of the fact
	// that input and output could be the same buffer.

	// Multiplies each of the four input coefficients by Sqrt2 and passes the
	// product through dct_const_round_shift(). Element i of the output depends
	// only on element i of the input, so input and output may be the same
	// buffer. Both parameters are pointers to four-element vectors.
	static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
	  for (int i = 0; i < 4; ++i) {
	    output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
	  }
	}

	// Turns a top-to-bottom strided walk into a bottom-to-top one: `dest` is
	// advanced by (size - 1) strides and `stride` is negated. Both `dest` and
	// `stride` must be modifiable lvalues; `dest` and `stride` are each expanded
	// more than once, so do not pass expressions with side effects.
	#define FLIPUD_PTR(dest, stride, size) \
	do { \
	(dest) = (dest) + ((size)-1) * (stride); \
	(stride) = -(stride); \
	} while (0)

	// Adjusts the destination and/or source pointer-stride pairs in place so
	// that the later "add transposed src to dst" loop produces the flips that
	// tx_type requires. All four pointer arguments are in/out: the caller's
	// pointer and stride variables are updated through them. Transform types
	// without a FLIPADST component leave everything untouched.
	static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
	                               int *sstride, TX_TYPE tx_type, int sizey,
	                               int sizex) {
	  // Note that the transpose of src will be added to dst. In order to LR
	  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
	  // the addends, we UD flip the dst.
	  switch (tx_type) {
	    case DCT_DCT:
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case IDTX:
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST: break;
	    case FLIPADST_DCT:
	    case FLIPADST_ADST:
	    case V_FLIPADST:
	      // flip UD
	      FLIPUD_PTR(*dst, *dstride, sizey);
	      break;
	    case DCT_FLIPADST:
	    case ADST_FLIPADST:
	    case H_FLIPADST:
	      // flip LR
	      FLIPUD_PTR(*src, *sstride, sizex);
	      break;
	    case FLIPADST_FLIPADST:
	      // flip UD
	      FLIPUD_PTR(*dst, *dstride, sizey);
	      // flip LR
	      FLIPUD_PTR(*src, *sstride, sizex);
	      break;
	    default: assert(0); break;
	  }
	}

	// Adds a bsx-wide, bsy-high block of coefficients, each right-shifted by a
	// size-dependent amount, into the 16-bit destination using
	// highbd_clip_pixel_add(). Does nothing unless tx_type is IDTX.
	//   input  - coefficients, bsx per row, rows stored back to back
	//   dest8  - destination handle, converted with CONVERT_TO_SHORTPTR
	//   stride - destination row stride in 16-bit samples
	//   bd     - bit depth forwarded to highbd_clip_pixel_add()
	static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
	                                  int stride, int bsx, int bsy, TX_TYPE tx_type,
	                                  int bd) {
	  const int pels = bsx * bsy;
	  // The shift is 3 for blocks of up to 256 pels, 2 up to 1024 pels, else 1.
	  const int shift = 3 - ((pels > 256) + (pels > 1024));
	  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

	  if (tx_type == IDTX) {
	    for (int r = 0; r < bsy; ++r) {
	      for (int c = 0; c < bsx; ++c)
	        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
	      dest += stride;
	      input += bsx;
	    }
	  }
	}

	// 4x4 inverse hybrid transform followed by an add into `dest`.
	//   input      - 16 coefficients, four per row
	//   dest       - 8-bit destination block, updated in place
	//   stride     - destination row stride
	//   txfm_param - only tx_type is read here
	// DCT_DCT is forwarded to aom_idct4x4_16_add(); every other type runs two
	// 1-D passes selected from the table below.
	void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
	                         const TxfmParam *txfm_param) {
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  if (tx_type == DCT_DCT) {
	    aom_idct4x4_16_add(input, dest, stride);
	    return;
	  }
	  // 1-D kernel pairs, indexed by TX_TYPE.
	  static const transform_2d IHT_4[] = {
	    { aom_idct4_c, aom_idct4_c },    // DCT_DCT = 0
	    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
	    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
	    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
	    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
	    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
	    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
	    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
	    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
	    { iidtx4_c, iidtx4_c },          // IDTX
	    { aom_idct4_c, iidtx4_c },       // V_DCT
	    { iidtx4_c, aom_idct4_c },       // H_DCT
	    { aom_iadst4_c, iidtx4_c },      // V_ADST
	    { iidtx4_c, aom_iadst4_c },      // H_ADST
	    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
	    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
	  };

	  tran_low_t tmp[4][4];
	  tran_low_t out[4][4];
	  tran_low_t *outp = &out[0][0];
	  int outstride = 4;

	  // inverse transform row vectors
	  for (int i = 0; i < 4; ++i) {
	    IHT_4[tx_type].rows(input, out[i]);
	    input += 4;
	  }

	  // transpose
	  for (int i = 0; i < 4; i++) {
	    for (int j = 0; j < 4; j++) {
	      tmp[j][i] = out[i][j];
	    }
	  }

	  // inverse transform column vectors
	  for (int i = 0; i < 4; ++i) {
	    IHT_4[tx_type].cols(tmp[i], out[i]);
	  }

	  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);

	  // Sum with the destination. `out` is still transposed, so destination
	  // (i, j) takes source (j, i).
	  for (int i = 0; i < 4; ++i) {
	    for (int j = 0; j < 4; ++j) {
	      int d = i * stride + j;
	      int s = j * outstride + i;
	      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
	    }
	  }
	}

	// idct
	// Picks between the two aom_highbd_iwht4x4 entry points based on eob: the
	// `_16_add` variant when eob is greater than 1, the `_1_add` variant
	// otherwise. `input` and `dest` are buffers and are passed through as-is.
	static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
	                               int stride, int eob, int bd) {
	  if (eob > 1)
	    aom_highbd_iwht4x4_16_add(input, dest, stride, bd);
	  else
	    aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
	}

	// Reinterprets a tran_low_t coefficient buffer as int32_t and returns the
	// converted pointer. The assert documents the requirement that both types
	// have the same size.
	static const int32_t *cast_to_int32(const tran_low_t *input) {
	  assert(sizeof(int32_t) == sizeof(tran_low_t));
	  return (const int32_t *)input;
	}

	// 4x4 inverse transform + add on the high-bitdepth path.
	//   input      - coefficients
	//   dest       - destination handle (converted with CONVERT_TO_SHORTPTR)
	//   stride     - destination stride
	//   txfm_param - supplies tx_type, eob, bd, lossless and tx_set_type
	// Lossless blocks go through highbd_iwht4x4_add(); all others go to one of
	// the av1_inv_txfm2d_add_4x4 entry points depending on tx_type.
	void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
	                                 int stride, const TxfmParam *txfm_param) {
	  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
	  int eob = txfm_param->eob;
	  int bd = txfm_param->bd;
	  int lossless = txfm_param->lossless;
	  const int32_t *src = cast_to_int32(input);
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  if (lossless) {
	    assert(tx_type == DCT_DCT);
	    highbd_iwht4x4_add(input, dest, stride, eob, bd);
	    return;
	  }
	  switch (tx_type) {
	    // DCT/ADST combinations, with or without flipping, share one call.
	    case DCT_DCT:
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case FLIPADST_DCT:
	    case DCT_FLIPADST:
	    case FLIPADST_FLIPADST:
	    case ADST_FLIPADST:
	    case FLIPADST_ADST:
	      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
	                             bd);
	      break;
	    // use the c version for anything including identity for now
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST:
	    case V_FLIPADST:
	    case H_FLIPADST:
	    case IDTX:
	      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
	                               bd);
	      break;
	    default: assert(0); break;
	  }
	}

	// Forwards a 4x8 block to av1_inv_txfm2d_add_4x8_c() after asserting that
	// the transform type is allowed for the block's transform set. `input` is
	// the coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
	                                    int stride, const TxfmParam *txfm_param) {
	  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                           txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards an 8x4 block to av1_inv_txfm2d_add_8x4_c() after asserting that
	// the transform type is allowed for the block's transform set. `input` is
	// the coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
	                                    int stride, const TxfmParam *txfm_param) {
	  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                           txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards an 8x16 block to av1_inv_txfm2d_add_8x16_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 16x8 block to av1_inv_txfm2d_add_16x8_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 16x32 block to av1_inv_txfm2d_add_16x32_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 32x16 block to av1_inv_txfm2d_add_32x16_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 16x4 block to av1_inv_txfm2d_add_16x4_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 4x16 block to av1_inv_txfm2d_add_4x16_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 32x8 block to av1_inv_txfm2d_add_32x8_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards an 8x32 block to av1_inv_txfm2d_add_8x32_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
	                                     int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                            txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 32x64 block to av1_inv_txfm2d_add_32x64_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 64x32 block to av1_inv_txfm2d_add_64x32_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 16x64 block to av1_inv_txfm2d_add_16x64_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// Forwards a 64x16 block to av1_inv_txfm2d_add_64x16_c(). `input` is the
	// coefficient buffer and `dest` the destination handle.
	static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int32_t *src = cast_to_int32(input);
	  av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                             txfm_param->tx_type, txfm_param->bd);
	}

	// 8x8 inverse transform + add. DCT/ADST/FLIPADST combinations go through
	// av1_inv_txfm2d_add_8x8(); types with an identity pass use the `_c`
	// variant. `input` is the coefficient buffer and `dest` the destination
	// handle.
	static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
	                                    int stride, const TxfmParam *txfm_param) {
	  int bd = txfm_param->bd;
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  const int32_t *src = cast_to_int32(input);
	  switch (tx_type) {
	    case DCT_DCT:
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case FLIPADST_DCT:
	    case DCT_FLIPADST:
	    case FLIPADST_FLIPADST:
	    case ADST_FLIPADST:
	    case FLIPADST_ADST:
	      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
	                             bd);
	      break;
	    // use the c version for anything including identity for now
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST:
	    case V_FLIPADST:
	    case H_FLIPADST:
	    case IDTX:
	      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
	                               bd);
	      break;
	    default: assert(0);
	  }
	}

	// Selects the 16x16 inverse-transform routine according to
	// txfm_param->tx_type and invokes it.
	//
	// input       pointer to the coefficients; handed on via cast_to_int32().
	// dest        pointer to the destination; handed on via
	//             CONVERT_TO_SHORTPTR().
	// stride      passed through unchanged.
	// txfm_param  supplies tx_type and bd.
	//
	// Transform types made up only of DCT / ADST / FLIPADST go to
	// av1_inv_txfm2d_add_16x16; the remaining (identity-containing) types go
	// to av1_inv_txfm2d_add_16x16_c. Any other value trips assert(0).
	static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int bd = txfm_param->bd;
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  const int32_t *const src = cast_to_int32(input);
	  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);

	  switch (tx_type) {
	    // Non-flipped and flipped variants take the same path, so they share
	    // one group.
	    case DCT_DCT:
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case FLIPADST_DCT:
	    case DCT_FLIPADST:
	    case FLIPADST_FLIPADST:
	    case ADST_FLIPADST:
	    case FLIPADST_ADST:
	      av1_inv_txfm2d_add_16x16(src, dest16, stride, tx_type, bd);
	      break;
	    // use the c version for anything including identity for now
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST:
	    case V_FLIPADST:
	    case H_FLIPADST:
	    case IDTX:
	      av1_inv_txfm2d_add_16x16_c(src, dest16, stride, tx_type, bd);
	      break;
	    default: assert(0);
	  }
	}

	// Selects the 32x32 inverse-transform routine according to
	// txfm_param->tx_type and invokes it.
	//
	// input       pointer to the coefficients; handed on via cast_to_int32().
	// dest        pointer to the destination; handed on via
	//             CONVERT_TO_SHORTPTR().
	// stride      passed through unchanged.
	// txfm_param  supplies tx_type and bd.
	//
	// Only DCT_DCT goes to av1_inv_txfm2d_add_32x32; every other listed type
	// goes to av1_inv_txfm2d_add_32x32_c. Any other value trips assert(0).
	static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int bd = txfm_param->bd;
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  const int32_t *const src = cast_to_int32(input);
	  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);

	  switch (tx_type) {
	    case DCT_DCT:
	      av1_inv_txfm2d_add_32x32(src, dest16, stride, tx_type, bd);
	      break;

	    // The optimised version only supports DCT_DCT, so force use of
	    // the C version for all other transform types.
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case FLIPADST_DCT:
	    case DCT_FLIPADST:
	    case FLIPADST_FLIPADST:
	    case ADST_FLIPADST:
	    case FLIPADST_ADST:
	    case IDTX:
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST:
	    case V_FLIPADST:
	    case H_FLIPADST:
	      av1_inv_txfm2d_add_32x32_c(src, dest16, stride, tx_type, bd);
	      break;

	    default: assert(0);
	  }
	}

	// Selects the 64x64 inverse-transform routine according to
	// txfm_param->tx_type and invokes it.
	//
	// input       pointer to the coefficients.
	// dest        pointer to the destination.
	// stride      passed through unchanged.
	// txfm_param  supplies tx_type and bd.
	//
	// DCT_DCT goes to av1_inv_txfm2d_add_64x64. IDTX goes to
	// highbd_inv_idtx_add_c with the raw input/dest pointers. Every other
	// listed type is deliberately run as DCT_DCT through the _c routine (see
	// the TODO below). Any other value trips assert(0).
	static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
	                                      int stride, const TxfmParam *txfm_param) {
	  const int bd = txfm_param->bd;
	  const TX_TYPE tx_type = txfm_param->tx_type;
	  const int32_t *const src = cast_to_int32(input);

	  switch (tx_type) {
	    case DCT_DCT:
	      av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride,
	                               DCT_DCT, bd);
	      break;
	    case ADST_DCT:
	    case DCT_ADST:
	    case ADST_ADST:
	    case FLIPADST_DCT:
	    case DCT_FLIPADST:
	    case FLIPADST_FLIPADST:
	    case ADST_FLIPADST:
	    case FLIPADST_ADST:
	    case V_DCT:
	    case H_DCT:
	    case V_ADST:
	    case H_ADST:
	    case V_FLIPADST:
	    case H_FLIPADST:
	      // TODO(sarahparker)
	      // I've deleted the 64x64 implementations that existed in lieu
	      // of adst, flipadst and identity for simplicity but will bring back
	      // in a later change. This shouldn't impact performance since
	      // DCT_DCT is the only extended type currently allowed for 64x64,
	      // as dictated by get_ext_tx_set_type in blockd.h.
	      av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
	                                 DCT_DCT, bd);
	      break;
	    case IDTX:
	      highbd_inv_idtx_add_c(input, dest, stride, 64, 64, tx_type, bd);
	      break;
	    default: assert(0); break;
	  }
	}

	// Fills *txfm_param for one transform block.
	//
	// tx_type, tx_size and eob are copied from the arguments. lossless is read
	// from xd->lossless[] at the segment id of xd->mi[0]; bd comes from xd->bd;
	// is_hbd is whatever get_bitdepth_data_path_index(xd) returns. tx_set_type
	// is computed by get_ext_tx_set_type() from the transform size, the plane
	// block size of `plane`, the inter/intra status of xd->mi[0] and
	// reduced_tx_set.
	static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
	                            TX_TYPE tx_type, int eob, int reduced_tx_set,
	                            TxfmParam *txfm_param) {
	  const struct macroblockd_plane *const plane_info = &xd->plane[plane];
	  const BLOCK_SIZE bsize_in_plane =
	      get_plane_block_size(xd->mi[0]->mbmi.sb_type, plane_info);

	  // Values taken straight from the arguments.
	  txfm_param->tx_size = tx_size;
	  txfm_param->tx_type = tx_type;
	  txfm_param->eob = eob;

	  // Values derived from the macroblock descriptor.
	  txfm_param->bd = xd->bd;
	  txfm_param->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
	  txfm_param->is_hbd = get_bitdepth_data_path_index(xd);

	  // tx_size here is the value just stored in txfm_param->tx_size.
	  txfm_param->tx_set_type =
	      get_ext_tx_set_type(tx_size, bsize_in_plane,
	                          is_inter_block(&xd->mi[0]->mbmi), reduced_tx_set);
	}

	static void highbd_inv_txfm_add(const tran_low_t input, uint8_t dest,
	int stride, const TxfmParam *txfm_param) {
	assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
	const TX_SIZE tx_size = txfm_param->tx_size;
	switch (tx_size) {
	case TX_32X32:
	highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
	break;
	case TX_16X16:
	highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
	break;
	case TX_8X8:
	highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
	break;
	case TX_4X8:
	highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
	break;
	case TX_8X4:
	highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
	break;
	case TX_8X16:
	highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
	break;
	case TX_16X8:
	highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
	break;
	case TX_16X32:
	highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
	break;
	case TX_32X16:
	highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
	break;
	case TX_64X64:
	highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
	break;
	case TX_32X64:
	highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
	break;
	case TX_64X32:
	highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
	break;
	case TX_16X64:
	highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
	break;
	case TX_64X16:
	highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
	break;
	case TX_4X4:
	// this is like av1_short_idct4x4 but has a special case around eob<=1
	// which is significant (not just an optimization) for the lossless
	// case.
	av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
	break;
	case TX_16X4:
	highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
	break;
	case TX_4X16:
	highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
	break;
	case TX_8X32:
	highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
	break;
	case TX_32X8:
	highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
	break;
	default: assert(0 && "Invalid transform size"); break;
	}
	}

	// Inverse transform + add for an 8-bit destination, implemented on top of
	// highbd_inv_txfm_add().
	//
	// dqcoeff     pointer to the coefficients, forwarded unchanged.
	// dst         pointer to the 8-bit destination block; read and then
	//             overwritten for the w x h area of the transform.
	// stride      distance, in elements, between consecutive rows of dst.
	// txfm_param  supplies tx_size (which fixes w and h) and is forwarded.
	//
	// The w x h area of dst is widened into a 16-bit scratch buffer, the
	// high-bitdepth routine runs on that buffer, and the result is narrowed
	// back into dst with a plain (uint8_t) cast.
	void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
	                        const TxfmParam *txfm_param) {
	  const TX_SIZE tx_size = txfm_param->tx_size;
	  const int w = tx_size_wide[tx_size];
	  const int h = tx_size_high[tx_size];
	  const int tmp_stride = MAX_TX_SIZE;
	  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);

	  // Widen the current destination samples into the scratch buffer.
	  for (int r = 0; r < h; ++r) {
	    const uint8_t *const dst_row = dst + r * stride;
	    uint16_t *const tmp_row = tmp + r * tmp_stride;
	    for (int c = 0; c < w; ++c) {
	      tmp_row[c] = dst_row[c];
	    }
	  }

	  highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
	                      txfm_param);

	  // Narrow the reconstructed samples back into the 8-bit destination.
	  for (int r = 0; r < h; ++r) {
	    uint8_t *const dst_row = dst + r * stride;
	    const uint16_t *const tmp_row = tmp + r * tmp_stride;
	    for (int c = 0; c < w; ++c) {
	      dst_row[c] = (uint8_t)tmp_row[c];
	    }
	  }
	}

	// Applies the inverse transform for one block and adds it into dst.
	//
	// Returns immediately when eob is zero. Otherwise a TxfmParam is built
	// with init_txfm_param() and the work is handed to highbd_inv_txfm_add()
	// when txfm_param.is_hbd is non-zero, or to av1_inv_txfm_add() when it is
	// zero. Debug builds assert that eob does not exceed
	// av1_get_max_eob(tx_size) and that the transform type is enabled for the
	// derived transform set.
	void av1_inverse_transform_block(const MACROBLOCKD *xd,
	                                 const tran_low_t *dqcoeff, int plane,
	                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
	                                 int stride, int eob, int reduced_tx_set) {
	  // No coded coefficients: nothing to do.
	  if (eob == 0) {
	    return;
	  }

	  assert(eob <= av1_get_max_eob(tx_size));

	  TxfmParam params;
	  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, &params);
	  assert(av1_ext_tx_used[params.tx_set_type][params.tx_type]);

	  if (!params.is_hbd) {
	    av1_inv_txfm_add(dqcoeff, dst, stride, &params);
	  } else {
	    highbd_inv_txfm_add(dqcoeff, dst, stride, &params);
	  }
	}