JNT_COMP: add SIMD implementations of C functions

Add SIMD implementations of the C functions for low bit-depth, making
the encoder 3~4x faster than with the C functions.

Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 16ad001..8498093 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -214,7 +214,7 @@
                                                                           \
     aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                           \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);             \
   }
 #else  // CONFIG_JNT_COMP
 #define SUBPIX_AVG_VAR(W, H)                                            \
@@ -397,13 +397,11 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;
@@ -420,7 +418,6 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                      ref_stride);
@@ -428,8 +425,7 @@
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;