JNT_COMP: add SIMD implementations for c functions
Add SIMD implementations of the C functions for low bit-depth, making
the encoder 3~4x faster than with the C functions.
Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 16ad001..8498093 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -214,7 +214,7 @@
\
aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
\
- return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
}
#else // CONFIG_JNT_COMP
#define SUBPIX_AVG_VAR(W, H) \
@@ -397,13 +397,11 @@
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- double sum = bck_offset + fwd_offset;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
- tmp = (int)(0.5 + tmp / sum);
- if (tmp > 255) tmp = 255;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;
@@ -420,7 +418,6 @@
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- double sum = bck_offset + fwd_offset;
aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
ref_stride);
@@ -428,8 +425,7 @@
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
- tmp = (int)(0.5 + tmp / sum);
- if (tmp > 255) tmp = 255;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;