Turn on jnt_comp by default

Turn off CONFIG_RD_DEBUG when jnt_comp is on, to avoid a stack
overflow.

Make subpel processing for width <= 4 correct.

Change-Id: Ic1de96ff2eff4a80543e19531fa75511b0a2f427
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index b567736..b1f5bad 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -25,7 +25,7 @@
     const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
     unsigned int pixel_step, unsigned int output_height,
     unsigned int output_width, const uint8_t *filter) {
-  // Note: filter[0], filter[1] and be {128, 0}, where 128 will overflow
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
   // in computation using _mm_maddubs_epi16.
   // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
   const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
@@ -70,8 +70,7 @@
       a += src_pixels_per_line - output_width;
     }
   } else {
-    // output_width := 4, process two lines
-    for (i = 0; i < output_height; i += 2) {
+    for (i = 0; i < output_height; ++i) {
       // load source, only first 5 values are meaningful:
       // { a[0], a[1], a[2], a[3], a[4], xxxx }
       __m128i source = xx_loadl_64(a);
@@ -79,27 +78,15 @@
       // shuffle, up to the first 8 are useful
       // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
       //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-      __m128i shuffle_lo = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i source_high_0 = xx_loadl_32(a + src_pixels_per_line);
-      __m128i source_high_1 = _mm_setzero_si128();
-      // avoid load undefined memory
-      if (a + src_pixels_per_line + 4 != NULL)
-        source_high_1 = xx_loadl_32(a + src_pixels_per_line + 4);
-      source = _mm_unpacklo_epi32(source_high_0, source_high_1);
-
-      __m128i shuffle_hi = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i source_shuffle = _mm_unpacklo_epi64(shuffle_lo, shuffle_hi);
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
 
       __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
       res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
 
       xx_storel_64(b, res);
-      xx_storel_64(b + output_width, _mm_srli_si128(res, 8));
 
-      a += src_pixels_per_line * 2;
-      b += output_width * 2;
+      a += src_pixels_per_line;
+      b += output_width;
     }
   }
 }
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index b4b887f..fc9f725 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -125,7 +125,7 @@
 set(CONFIG_INTRABC 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_INTRA_EDGE 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_INTRA_EDGE2 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_JNT_COMP 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_JNT_COMP 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_LOOPFILTERING_ACROSS_TILES 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_LOOPFILTERING_ACROSS_TILES_EXT 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_LOOPFILTER_LEVEL 1 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake
index 45a6341..59a11d7 100644
--- a/build/cmake/aom_experiment_deps.cmake
+++ b/build/cmake/aom_experiment_deps.cmake
@@ -64,6 +64,12 @@
       change_config_and_warn(CONFIG_AOM_QM 1 CONFIG_AOM_QM_EXT)
     endif ()
   endif ()
+
+  if (CONFIG_JNT_COMP)
+    if (CONFIG_RD_DEBUG)
+      change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
+    endif()
+  endif()
 endmacro ()
 
 endif ()  # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_