blob: 18a70dffe73cde376d4aa4a606f3be8d6f2af077 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
Yi Luoe9fde262016-10-07 15:02:33 -070011
12#include <immintrin.h>
Yaowu Xuf883b422016-08-30 14:01:10 -070013#include "./aom_dsp_rtcd.h"
Johannc3bdffb2015-05-15 11:52:03 -070014
// Signature of the per-tile kernels that variance_avx2() dispatches to. A
// kernel receives a source and a reference pointer with their strides and
// reports its two results through the sse and sum output pointers.
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);
18
// Kernel defined outside this file; it matches the get_var_avx2 signature and
// is passed to variance_avx2() by the 32- and 64-wide variance functions.
void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);
Johannc3bdffb2015-05-15 11:52:03 -070022
23static void variance_avx2(const uint8_t *src, int src_stride,
clang-format1214cee2016-08-08 22:59:08 -070024 const uint8_t *ref, int ref_stride, int w, int h,
25 unsigned int *sse, int *sum, get_var_avx2 var_fn,
26 int block_size) {
Johannc3bdffb2015-05-15 11:52:03 -070027 int i, j;
28
29 *sse = 0;
30 *sum = 0;
31
32 for (i = 0; i < h; i += 16) {
33 for (j = 0; j < w; j += block_size) {
34 unsigned int sse0;
35 int sum0;
clang-format1214cee2016-08-08 22:59:08 -070036 var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
37 ref_stride, &sse0, &sum0);
Johannc3bdffb2015-05-15 11:52:03 -070038 *sse += sse0;
39 *sum += sum0;
40 }
41 }
42}
43
Yaowu Xuf883b422016-08-30 14:01:10 -070044unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
Johannc3bdffb2015-05-15 11:52:03 -070045 const uint8_t *ref, int ref_stride,
46 unsigned int *sse) {
47 int sum;
Yi Luoe9fde262016-10-07 15:02:33 -070048 unsigned int variance;
clang-format1214cee2016-08-08 22:59:08 -070049 variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
Yaowu Xuf883b422016-08-30 14:01:10 -070050 aom_get16x16var_avx2, 16);
Yi Luoe9fde262016-10-07 15:02:33 -070051
52 variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
53 _mm256_zeroupper();
54 return variance;
Johannc3bdffb2015-05-15 11:52:03 -070055}
56
// 16x16 MSE wrapper. Calls the aom_get16x16var_avx2 kernel once and returns
// the value it stored in *sse; the kernel's sum output is discarded.
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int ignored_sum;  // Required by the kernel signature but not used here.

  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &ignored_sum);

  // Clear the upper halves of the YMM registers before returning.
  _mm256_zeroupper();
  return *sse;
}
65
Yaowu Xuf883b422016-08-30 14:01:10 -070066unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
Johannc3bdffb2015-05-15 11:52:03 -070067 const uint8_t *ref, int ref_stride,
68 unsigned int *sse) {
69 int sum;
Yi Luoe9fde262016-10-07 15:02:33 -070070 unsigned int variance;
clang-format1214cee2016-08-08 22:59:08 -070071 variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
Yaowu Xuf883b422016-08-30 14:01:10 -070072 aom_get32x32var_avx2, 32);
Yi Luoe9fde262016-10-07 15:02:33 -070073
74 variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
75 _mm256_zeroupper();
76 return variance;
Johannc3bdffb2015-05-15 11:52:03 -070077}
78
Yaowu Xuf883b422016-08-30 14:01:10 -070079unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
Johannc3bdffb2015-05-15 11:52:03 -070080 const uint8_t *ref, int ref_stride,
81 unsigned int *sse) {
82 int sum;
Yi Luoe9fde262016-10-07 15:02:33 -070083 unsigned int variance;
clang-format1214cee2016-08-08 22:59:08 -070084 variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
Yaowu Xuf883b422016-08-30 14:01:10 -070085 aom_get32x32var_avx2, 32);
Yi Luoe9fde262016-10-07 15:02:33 -070086
87 variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
88 _mm256_zeroupper();
89 return variance;
Johannc3bdffb2015-05-15 11:52:03 -070090}
91
Yaowu Xuf883b422016-08-30 14:01:10 -070092unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
Johannc3bdffb2015-05-15 11:52:03 -070093 const uint8_t *ref, int ref_stride,
94 unsigned int *sse) {
95 int sum;
Yi Luoe9fde262016-10-07 15:02:33 -070096 unsigned int variance;
clang-format1214cee2016-08-08 22:59:08 -070097 variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
Yaowu Xuf883b422016-08-30 14:01:10 -070098 aom_get32x32var_avx2, 32);
Yi Luoe9fde262016-10-07 15:02:33 -070099
100 variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
101 _mm256_zeroupper();
102 return variance;
Johannc3bdffb2015-05-15 11:52:03 -0700103}
104
Yaowu Xuf883b422016-08-30 14:01:10 -0700105unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
Johannc3bdffb2015-05-15 11:52:03 -0700106 const uint8_t *ref, int ref_stride,
107 unsigned int *sse) {
108 int sum;
Yi Luoe9fde262016-10-07 15:02:33 -0700109 unsigned int variance;
clang-format1214cee2016-08-08 22:59:08 -0700110 variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
Yaowu Xuf883b422016-08-30 14:01:10 -0700111 aom_get32x32var_avx2, 32);
Yi Luoe9fde262016-10-07 15:02:33 -0700112
113 variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
114 _mm256_zeroupper();
115 return variance;
Johannc3bdffb2015-05-15 11:52:03 -0700116}
Johann6a82f0d2015-06-05 09:54:19 -0700117
// Kernel defined outside this file. The sub-pixel variance wrappers in this
// file call it on 32-wide columns, use its return value as the sum term and
// read the sse it writes through the last argument.
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride, int x_offset,
                                             int y_offset, const uint8_t *dst,
                                             int dst_stride, int height,
                                             unsigned int *sse);
Johann6a82f0d2015-06-05 09:54:19 -0700122
// Kernel defined outside this file. The sub-pixel averaging variance wrappers
// in this file call it on 32-wide columns, use its return value as the sum
// term and read the sse it writes through sseptr.
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride,
    const uint8_t *sec, int sec_stride,
    int height, unsigned int *sseptr);
Johann6a82f0d2015-06-05 09:54:19 -0700127
// 64x64 sub-pixel variance wrapper. The block is handled as two 32-wide,
// 64-row columns (at offsets 0 and 32 in both src and dst), each passed to
// aom_sub_pixel_variance32xh_avx2. The two sse outputs are added into *sse and
// the two return values are added to form the sum term; the result is
// *sse - (sum * sum) / 4096, where 4096 = 64 * 64.
unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int left_sse;
  unsigned int right_sse;

  // The kernel's unsigned return value is stored as a signed int.
  const int left_sum = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &left_sse);
  const int right_sum = aom_sub_pixel_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, 64,
      &right_sse);

  const int total_sum = left_sum + right_sum;
  *sse = left_sse + right_sse;

  // Square in 64 bits, shift, then narrow to 32 bits.
  const int64_t sum_squared = (int64_t)total_sum * total_sum;
  const unsigned int result = *sse - (uint32_t)(sum_squared >> 12);

  // Clear the upper halves of the YMM registers before returning.
  _mm256_zeroupper();
  return result;
}
148
// 32x32 sub-pixel variance wrapper. Makes a single 32-row call to
// aom_sub_pixel_variance32xh_avx2, which writes *sse directly and whose return
// value is used as the sum term; the result is *sse - (sum * sum) / 1024,
// where 1024 = 32 * 32.
unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  // The kernel's unsigned return value is stored as a signed int.
  const int total_sum = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);

  // Square in 64 bits, shift, then narrow to 32 bits.
  const int64_t sum_squared = (int64_t)total_sum * total_sum;
  const unsigned int result = *sse - (uint32_t)(sum_squared >> 10);

  // Clear the upper halves of the YMM registers before returning.
  _mm256_zeroupper();
  return result;
}
161
// 64x64 sub-pixel averaging variance wrapper. The block is handled as two
// 32-wide, 64-row columns (at offsets 0 and 32 in src, dst and sec), each
// passed to aom_sub_pixel_avg_variance32xh_avx2 with a sec stride of 64. The
// two sse outputs are added into *sse and the two return values are added to
// form the sum term; the result is *sse - (sum * sum) / 4096, where
// 4096 = 64 * 64.
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  unsigned int left_sse;
  unsigned int right_sse;

  // The kernel's unsigned return value is stored as a signed int.
  const int left_sum = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64,
      &left_sse);
  const int right_sum = aom_sub_pixel_avg_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,
      sec + 32, 64, 64, &right_sse);

  const int total_sum = left_sum + right_sum;
  *sse = left_sse + right_sse;

  // Square in 64 bits, shift, then narrow to 32 bits.
  const int64_t sum_squared = (int64_t)total_sum * total_sum;
  const unsigned int result = *sse - (uint32_t)(sum_squared >> 12);

  // Clear the upper halves of the YMM registers before returning.
  _mm256_zeroupper();
  return result;
}
181
// 32x32 sub-pixel averaging variance wrapper. Makes a single 32-row call to
// aom_sub_pixel_avg_variance32xh_avx2 with a sec stride of 32; the kernel
// writes *sse directly and its return value is used as the sum term. The
// result is *sse - (sum * sum) / 1024, where 1024 = 32 * 32.
unsigned int aom_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  // One kernel call covers the full 32-wide block.
  // The kernel's unsigned return value is stored as a signed int.
  const int total_sum = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);

  // Square in 64 bits, shift, then narrow to 32 bits.
  const int64_t sum_squared = (int64_t)total_sum * total_sum;
  const unsigned int result = *sse - (uint32_t)(sum_squared >> 10);

  // Clear the upper halves of the YMM registers before returning.
  _mm256_zeroupper();
  return result;
}