/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "config/aom_dsp_rtcd.h"

static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum) {
  const int16_t *data_tmp = data;
  __m128i temp_buffer1, temp_buffer2;
  __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
  __m128i one = _mm_set1_epi16(1);
  __m128i regx_sum = _mm_setzero_si128();
  __m128i regx2_sum = regx_sum;

  for (int j = 0; j < (bh >> 1); ++j) {
    // Load 2 rows (8 pixels) at a time.
    load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
    load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
    load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
    sum_buffer = _mm_madd_epi16(load_pixels_low, one);
    sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
    regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
    regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
    data_tmp += 2 * stride;
  }

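  // Horizontally add the four 32-bit lanes of regx_sum to obtain x_sum.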
  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
  *x_sum = _mm_cvtsi128_si32(regx_sum);
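  // Zero-extend the 32-bit squared-sum lanes to 64 bits before adding them,
  // so the horizontal reduction cannot overflow.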
  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
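  // _mm_cvtsi128_si64() is only available on 64-bit targets; 32-bit builds
  // extract the 64-bit result through memory instead.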
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
#else
  {
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
    *x2_sum += tmp;
  }
#endif
}

static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum,
                                    int loop_cycles) {
  const int16_t *data_tmp;
  __m128i temp_buffer1, temp_buffer2;
  __m128i one = _mm_set1_epi16(1);
  __m128i regx_sum = _mm_setzero_si128();
  __m128i regx2_sum = regx_sum;
  __m128i load_pixels, sum_buffer, sse_buffer;

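  // Each outer iteration covers one 8-pixel-wide column of the block.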
  for (int i = 0; i < loop_cycles; ++i) {
    data_tmp = data + (8 * i);
    for (int j = 0; j < bh; ++j) {
      // Load 1 row (8 pixels) at a time.
      load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
      sum_buffer = _mm_madd_epi16(load_pixels, one);
      sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
      regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
      regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
      data_tmp += stride;
    }
  }

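  // Final horizontal reductions of the sum and squared sum, as in
  // sse_sum_wd4_sse2().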
  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
  *x_sum += _mm_cvtsi128_si32(regx_sum);
  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
#else
  {
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
    *x2_sum += tmp;
  }
#endif
}

// Computes the sum and the sum of squares of a bw x bh block of int16_t
// samples. SSE2 version of aom_get_blk_sse_sum_c().
void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
                              int *x_sum, int64_t *x2_sum) {
  *x_sum = 0;
  *x2_sum = 0;

  if ((bh & 3) == 0) {
    switch (bw) {
      case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
      case 8:
      case 16:
        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
        break;
      // For widths 32 and 64, the 32-bit accumulators may overflow, so
      // split the block into slices of limited height and accumulate each
      // slice separately.
      case 32:
        if (bh <= 32) {
          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
          break;
        } else {
          sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
          sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
                           bw >> 3);
          break;
        }

      case 64:
        if (bh <= 16) {
          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
          break;
        } else {
          for (int i = 0; i < bh; i += 16)
            sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
                             bw >> 3);
          break;
        }

      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
    }
  } else {
    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
  }
}