/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "vp8/encoder/variance.h"
#include "vp8/encoder/onyx_int.h"


#if HAVE_MMX
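/* The MMX DCT kernel only handles a 4x4 block, so the 8x4 forward transform
 * is built from two 4x4 transforms over the left and right halves. Each 4x4
 * block produces 16 output coefficients, hence the output + 16 offset for
 * the second half.
 */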
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_mmx(input,     output,      pitch);
    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}

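/* C wrapper around the assembly quantizer: it unpacks the BLOCK/BLOCKD
 * fields into plain pointers for the _impl routine. Note that the scan
 * order is hard-wired to the default zig-zag mask rather than taken from
 * the block descriptor.
 */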
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr; */
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}

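/* Sum of squared differences between original and dequantized coefficients
 * for the whole macroblock. block[0].coeff points at the start of the
 * contiguous per-macroblock coefficient buffer, so the assembly routine can
 * walk the blocks linearly; dc selects whether the DC (first) coefficient
 * of each block is counted, since DC may be carried by the second-order
 * (Y2) block instead.
 */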
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}

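/* Chroma-only coefficient error. The 16 luma blocks occupy the first
 * 16 * 16 = 256 shorts of the coefficient buffer, so the U/V blocks start
 * at offset 256.
 */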
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}

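/* Residual for a single 4x4 block: diff = source - predictor. The source
 * pointer is reconstructed from the block's base pointer plus its offset
 * within the macroblock.
 */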
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSE2
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSSE3
#if CONFIG_INTERNAL_STATS
#if ARCH_X86_64
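/* Function type shared by the SSIM parameter-gathering kernels: they
 * accumulate the pixel sums, sums of squares, and source-by-reconstruction
 * cross terms that the SSIM calculation needs. As the nested #if above
 * indicates, these assembly routines are only built for 64-bit targets.
 */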
typedef void ssimpf
(
    unsigned char *s,
    int sp,
    unsigned char *r,
    int rp,
    unsigned long *sum_s,
    unsigned long *sum_r,
    unsigned long *sum_sq_s,
    unsigned long *sum_sq_r,
    unsigned long *sum_sxr
);

extern ssimpf vp8_ssim_parms_16x16_sse3;
extern ssimpf vp8_ssim_parms_8x8_sse3;
#endif
#endif
#endif


void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
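    /* x86_simd_caps() returns a bitmask of the HAS_* flags tested below.
     * The blocks run from oldest to newest instruction set, each one
     * overwriting the table entries set by the previous, so the fastest
     * supported variant is the one left installed.
     */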

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h)
     */
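    /* As an illustrative sketch (assumed pattern, not part of this file):
     * with CONFIG_RUNTIME_CPU_DETECT disabled, a static mapping header pins
     * an entry at compile time instead, e.g.
     *
     *     #undef  vp8_fdct_short4x4
     *     #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
     */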

    /* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
    if (flags & HAS_MMX)
    {
        cpi->rtcd.variance.sad16x16           = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8            = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16            = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8             = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4             = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4             = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8             = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16            = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8            = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16           = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4       = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8       = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16      = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8      = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16     = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.halfpixvar16x16_h  = vp8_variance_halfpixvar16x16_h_mmx;
        cpi->rtcd.variance.halfpixvar16x16_v  = vp8_variance_halfpixvar16x16_v_mmx;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
        cpi->rtcd.variance.subpixmse16x16     = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16           = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss            = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get4x4sse_cs       = vp8_get4x4sse_cs_mmx;

        cpi->rtcd.fdct.short4x4               = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4               = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4                = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4                = vp8_short_fdct8x4_mmx;

        cpi->rtcd.fdct.walsh_short4x4         = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr               = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr              = vp8_mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr            = vp8_mbuverror_mmx;
        cpi->rtcd.encodemb.subb               = vp8_subtract_b_mmx;
        cpi->rtcd.encodemb.submby             = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv            = vp8_subtract_mbuv_mmx;

        /* cpi->rtcd.quantize.fastquantb      = vp8_fast_quantize_b_mmx; */
    }
#endif

#if HAVE_SSE2
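    /* The _wmt suffix refers to Willamette, the first SSE2-capable Pentium 4
     * core; it is libvpx's historical name for SSE2 kernels.
     */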
    if (flags & HAS_SSE2)
    {
        cpi->rtcd.variance.sad16x16           = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8            = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16            = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8             = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4             = vp8_sad4x4_wmt;
        cpi->rtcd.variance.copy32xn           = vp8_copy32xn_sse2;

        cpi->rtcd.variance.var4x4             = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8             = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16            = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8            = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16           = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4       = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8       = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16      = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8      = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16     = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.halfpixvar16x16_h  = vp8_variance_halfpixvar16x16_h_wmt;
        cpi->rtcd.variance.halfpixvar16x16_v  = vp8_variance_halfpixvar16x16_v_wmt;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
        cpi->rtcd.variance.subpixmse16x16     = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16           = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss            = vp8_get_mb_ss_sse2;

        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */

        cpi->rtcd.fdct.short4x4               = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4               = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4                = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4                = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4         = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr               = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr              = vp8_mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr            = vp8_mbuverror_xmm;
        cpi->rtcd.encodemb.subb               = vp8_subtract_b_sse2;
        cpi->rtcd.encodemb.submby             = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv            = vp8_subtract_mbuv_sse2;

        cpi->rtcd.quantize.quantb             = vp8_regular_quantize_b_sse2;
        cpi->rtcd.quantize.fastquantb         = vp8_fast_quantize_b_sse2;

#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.temporal.apply              = vp8_temporal_filter_apply_sse2;
#endif
    }
#endif

#if HAVE_SSE3
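    /* The x3 SAD variants return SADs for three consecutive reference
     * offsets in one call, and the x4d variants for four independent
     * reference pointers; the sadx3/sadx4-flavored search functions
     * installed alongside them are the callers that exploit this.
     */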
    if (flags & HAS_SSE3)
    {
        cpi->rtcd.variance.sad16x16           = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3         = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3          = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3          = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3           = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3           = vp8_sad4x4x3_sse3;
        cpi->rtcd.search.full_search          = vp8_full_search_sadx3;
        cpi->rtcd.variance.sad16x16x4d        = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d         = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d         = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d          = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d          = vp8_sad4x4x4d_sse3;
        cpi->rtcd.variance.copy32xn           = vp8_copy32xn_sse3;
        cpi->rtcd.search.diamond_search       = vp8_diamond_search_sadx4;
        cpi->rtcd.search.refining_search      = vp8_refining_search_sadx4;
    }
#endif

#if HAVE_SSSE3
    if (flags & HAS_SSSE3)
    {
        cpi->rtcd.variance.sad16x16x3         = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3          = vp8_sad16x8x3_ssse3;

        cpi->rtcd.variance.subpixvar16x8      = vp8_sub_pixel_variance16x8_ssse3;
        cpi->rtcd.variance.subpixvar16x16     = vp8_sub_pixel_variance16x16_ssse3;

        cpi->rtcd.quantize.fastquantb         = vp8_fast_quantize_b_ssse3;

#if CONFIG_INTERNAL_STATS
#if ARCH_X86_64
        cpi->rtcd.variance.ssimpf_8x8         = vp8_ssim_parms_8x8_sse3;
        cpi->rtcd.variance.ssimpf             = vp8_ssim_parms_16x16_sse3;
#endif
#endif
    }
#endif

#if HAVE_SSE4_1
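    /* The x8 SAD variants return eight SADs per call, which is what lets
     * the full search step up from vp8_full_search_sadx3 to
     * vp8_full_search_sadx8 here.
     */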
    if (flags & HAS_SSE4_1)
    {
        cpi->rtcd.variance.sad16x16x8         = vp8_sad16x16x8_sse4;
        cpi->rtcd.variance.sad16x8x8          = vp8_sad16x8x8_sse4;
        cpi->rtcd.variance.sad8x16x8          = vp8_sad8x16x8_sse4;
        cpi->rtcd.variance.sad8x8x8           = vp8_sad8x8x8_sse4;
        cpi->rtcd.variance.sad4x4x8           = vp8_sad4x4x8_sse4;
        cpi->rtcd.search.full_search          = vp8_full_search_sadx8;

        cpi->rtcd.quantize.quantb             = vp8_regular_quantize_b_sse4;
    }
#endif

#endif
}