/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stddef.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"

// Declarations of the fixed-width, variable-height helpers implemented in
// subpel_variance.asm. Each helper processes `height` rows of a column that
// is `w` pixels wide, stores a squared-error total through `sse`, and returns
// an int that the C wrappers in this file accumulate as the error sum.
// The two trailing pointer parameters are unused place holders needed for
// PIC enabled builds; the wrappers in this file pass NULL for both.
#define DECL_SUBPEL_VAR_XH(w, opt)                                             \
  int aom_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)

DECL_SUBPEL_VAR_XH(4, ssse3);
DECL_SUBPEL_VAR_XH(8, ssse3);
DECL_SUBPEL_VAR_XH(16, ssse3);
#undef DECL_SUBPEL_VAR_XH

// Generates aom_sub_pixel_variance<w>x<h>_<opt>().
//
// The w x h block is covered by (w / wf) columns, each wf pixels wide, and
// every column is handed to the aom_sub_pixel_variance<wf>xh_<opt> helper in
// strips of at most 64 rows. The helper's return values are summed into a
// signed total and its per-strip SSE outputs into an unsigned total.
//
// The generated function stores the SSE total in *sse_ptr and returns
//   sse - ((sum * sum) >> (wlog2 + hlog2)).
// `cast` is applied to the first operand of sum * sum and `cast_prod` to the
// product before it is shifted.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
    /*Capping the strip height avoids overflow in the helper.*/               \
    const int strip_h = AOMMIN(h, 64);                                        \
    const int num_cols = w / wf;                                              \
    const int num_strips = h / strip_h;                                       \
    int sum = 0;                                                              \
    unsigned int sum_sq = 0;                                                  \
    for (int col = 0; col < num_cols; ++col, src += wf, dst += wf) {          \
      const uint8_t *s = src;                                                 \
      const uint8_t *d = dst;                                                 \
      for (int strip = 0; strip < num_strips; ++strip) {                      \
        unsigned int strip_sse;                                               \
        sum += aom_sub_pixel_variance##wf##xh_##opt(                          \
            s, src_stride, x_offset, y_offset, d, dst_stride, strip_h,        \
            &strip_sse, NULL, NULL);                                          \
        sum_sq += strip_sse;                                                  \
        s += strip_h * src_stride;                                            \
        d += strip_h * dst_stride;                                            \
      }                                                                       \
    }                                                                         \
    *sse_ptr = sum_sq;                                                        \
    return sum_sq -                                                           \
           (unsigned int)(cast_prod(cast sum * sum) >> (wlog2 + hlog2));      \
  }

// Instantiation table for the sub-pixel variance wrappers. Each row is
//   FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)
// where wf is the width of the helper used to tile the block and
// wlog2/hlog2 are the log2 of w and h.

// Block sizes generated in every configuration.
#define FNS_BASE(opt)                               \
  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))

// The 1:4 and 4:1 block sizes are only generated when the build is not
// realtime-only.
#if !CONFIG_REALTIME_ONLY
#define FNS_EXTRA(opt)                            \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))   \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))  \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))  \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
#else
#define FNS_EXTRA(opt)
#endif

#define FNS(opt) FNS_BASE(opt) FNS_EXTRA(opt)

FNS(ssse3)

#undef FNS
#undef FNS_EXTRA
#undef FNS_BASE
#undef FN

// Declarations of the fixed-width, variable-height helpers used by the
// aom_sub_pixel_avg_variance wrappers in this file. Compared with the plain
// sub-pixel variance helpers they take an extra `sec` buffer and its stride.
// Each helper stores a squared-error total through `sse` and returns an int
// that the wrappers accumulate as the error sum.
// NOTE(review): presumably implemented in assembly like the non-avg helpers;
// the defining file is not named here.
// The two trailing pointer parameters are unused place holders needed for
// PIC enabled builds; the wrappers in this file pass NULL for both.
#define DECL_SUBPEL_AVG_VAR_XH(w, opt)                                      \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)

DECL_SUBPEL_AVG_VAR_XH(4, ssse3);
DECL_SUBPEL_AVG_VAR_XH(8, ssse3);
DECL_SUBPEL_AVG_VAR_XH(16, ssse3);
#undef DECL_SUBPEL_AVG_VAR_XH

// Generates aom_sub_pixel_avg_variance<w>x<h>_<opt>().
//
// Same tiling as the plain sub-pixel variance wrappers: the w x h block is
// covered by (w / wf) columns, each handed to the
// aom_sub_pixel_avg_variance<wf>xh_<opt> helper in strips of at most 64 rows.
// `sec` is walked alongside src/dst and is passed to the helper with a row
// stride of w, i.e. it is addressed as a w-wide buffer with no padding.
//
// The generated function stores the SSE total in *sse_ptr and returns
//   sse - ((sum * sum) >> (wlog2 + hlog2)).
// `cast` is applied to the first operand of sum * sum and `cast_prod` to the
// product before it is shifted.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
      const uint8_t *sec) {                                                  \
    /*Capping the strip height avoids overflow in the helper.*/              \
    const int strip_h = AOMMIN(h, 64);                                       \
    const int num_cols = w / wf;                                             \
    const int num_strips = h / strip_h;                                      \
    int sum = 0;                                                             \
    unsigned int sum_sq = 0;                                                 \
    for (int col = 0; col < num_cols;                                        \
         ++col, src += wf, dst += wf, sec += wf) {                           \
      const uint8_t *s = src;                                                \
      const uint8_t *d = dst;                                                \
      const uint8_t *p = sec;                                                \
      for (int strip = 0; strip < num_strips; ++strip) {                     \
        unsigned int strip_sse;                                              \
        sum += aom_sub_pixel_avg_variance##wf##xh_##opt(                     \
            s, src_stride, x_offset, y_offset, d, dst_stride, p, w, strip_h, \
            &strip_sse, NULL, NULL);                                         \
        sum_sq += strip_sse;                                                 \
        s += strip_h * src_stride;                                           \
        d += strip_h * dst_stride;                                           \
        p += strip_h * w;                                                    \
      }                                                                      \
    }                                                                        \
    *sse_ptr = sum_sq;                                                       \
    return sum_sq -                                                          \
           (unsigned int)(cast_prod(cast sum * sum) >> (wlog2 + hlog2));     \
  }

// Instantiation table for the sub-pixel avg-variance wrappers. Each row is
//   FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)
// where wf is the width of the helper used to tile the block and
// wlog2/hlog2 are the log2 of w and h.

// Block sizes generated in every configuration.
#define FNS_BASE(opt)                               \
  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))

// The 1:4 and 4:1 block sizes are only generated when the build is not
// realtime-only.
#if !CONFIG_REALTIME_ONLY
#define FNS_EXTRA(opt)                            \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))   \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))  \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))  \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
#else
#define FNS_EXTRA(opt)
#endif

#define FNS(opt) FNS_BASE(opt) FNS_EXTRA(opt)

FNS(ssse3)

#undef FNS
#undef FNS_EXTRA
#undef FNS_BASE
#undef FN
