JNT_COMP: add SIMD implementations for C functions. Add SIMD implementations of the low bit-depth C functions, making the encoder 3~4x faster than with the C functions. Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52

commit: ef34fff7755b87846007b85f5bafc46b43791f58 [log] [tgz]
author: Cheng Chen <chengchen@google.com> Mon Oct 30 15:59:26 2017 -0700
committer: Jingning Han <jingning@google.com> Mon Nov 06 16:01:22 2017 +0000
tree: 2bad144d70632d3716de0cc32ac5ee2ee66f2a5a
parent: f78632e00cb3c679b1485e9d3e34b824d878b0f4 [diff] [blame]
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 16ad001..8498093 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c

@@ -214,7 +214,7 @@
                                                                           \
     aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                           \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);             \
   }
 #else  // CONFIG_JNT_COMP
 #define SUBPIX_AVG_VAR(W, H)                                            \
@@ -397,13 +397,11 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;
@@ -420,7 +418,6 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                      ref_stride);
@@ -428,8 +425,7 @@
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;
commit	ef34fff7755b87846007b85f5bafc46b43791f58	[log] [tgz]
author	Cheng Chen <chengchen@google.com>	Mon Oct 30 15:59:26 2017 -0700
committer	Jingning Han <jingning@google.com>	Mon Nov 06 16:01:22 2017 +0000
tree	2bad144d70632d3716de0cc32ac5ee2ee66f2a5a
parent	f78632e00cb3c679b1485e9d3e34b824d878b0f4 [diff] [blame]