Add INLINE for compute_jnt_comp_avg
Some compilers, such as MSVC 2013, do not inline
this small function or its highbd version.
For the encoder, adding INLINE gives about a 1.1% speedup,
as shown by encoding 10 frames of foreman_cif.
Change-Id: I8aab0819ae8660eb8b849c70082c678dfbf581fb
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index ceedd0f..84ffa88 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -678,9 +678,11 @@
}
}
-static void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
- const __m128i *w0, const __m128i *w1,
- const __m128i *r, void *const result) {
+static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
__m128i mult0 = _mm_mullo_epi16(*p0, *w0);
__m128i mult1 = _mm_mullo_epi16(*p1, *w1);
__m128i sum = _mm_add_epi16(mult0, mult1);
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index c731af0..79417ac 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -136,8 +136,9 @@
}
}
-static void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w,
- const __m128i *r, void *const result) {
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
__m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
__m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
__m128i round_lo = _mm_add_epi16(mult_lo, *r);