JNT_COMP: add SIMD and interface for high bit-depth

Add high bit-depth macro definitions:
highbd_jnt_sad
highbd_8(10/12)_jnt_sub_pixel_avg.

Add SIMD functions:
aom_highbd_jnt_comp_avg_pred_sse2
aom_highbd_jnt_comp_avg_upsampled_pred_sse2

This patch also solves the seg fault caused by low bit-depth and
high bit-depth paths

BUG=aomedia:967
BUG=aomedia:944

Change-Id: Iea69f114e81ca226a30d84a540ad846f1b94b8d6
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 71a00f4..b47eca3 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -655,6 +655,128 @@
                                                dst, dst_stride, sse);        \
   }
 
+#if CONFIG_JNT_COMP
+#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                              dst, dst_stride, sse);          \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
+                                          dst_stride, sse);                   \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
+  }
+#else  // CONFIG_JNT_COMP
 #define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
   uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
@@ -715,6 +837,7 @@
     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                                dst, dst_stride, sse);        \
   }
+#endif  // CONFIG_JNT_COMP
 
 /* All three forms of the variance are available in the same sizes. */
 #define HIGHBD_VARIANCES(W, H) \
@@ -853,6 +976,53 @@
     pred += width;
   }
 }
+
+#if CONFIG_JNT_COMP
+void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride,
+                                    const JNT_COMP_PARAMS *jcp_param) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
+    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
+    int bd, const JNT_COMP_PARAMS *jcp_param) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+#endif  // CONFIG_JNT_COMP
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if CONFIG_AV1