Turn on jnt_comp by default
Turn off CONFIG_RD_DEBUG when jnt_comp is on, to avoid stack size
overflow.
Make subpel processing for width <= 4 correct.
Change-Id: Ic1de96ff2eff4a80543e19531fa75511b0a2f427
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index b567736..b1f5bad 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -25,7 +25,7 @@
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter) {
- // Note: filter[0], filter[1] and be {128, 0}, where 128 will overflow
+ // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
// in computation using _mm_maddubs_epi16.
// Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
@@ -70,8 +70,7 @@
a += src_pixels_per_line - output_width;
}
} else {
- // output_width := 4, process two lines
- for (i = 0; i < output_height; i += 2) {
+ for (i = 0; i < output_height; ++i) {
// load source, only first 5 values are meaningful:
// { a[0], a[1], a[2], a[3], a[4], xxxx }
__m128i source = xx_loadl_64(a);
@@ -79,27 +78,15 @@
// shuffle, up to the first 8 are useful
// { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
// a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
- __m128i shuffle_lo = _mm_shuffle_epi8(source, shuffle_mask);
-
- __m128i source_high_0 = xx_loadl_32(a + src_pixels_per_line);
- __m128i source_high_1 = _mm_setzero_si128();
- // avoid load undefined memory
- if (a + src_pixels_per_line + 4 != NULL)
- source_high_1 = xx_loadl_32(a + src_pixels_per_line + 4);
- source = _mm_unpacklo_epi32(source_high_0, source_high_1);
-
- __m128i shuffle_hi = _mm_shuffle_epi8(source, shuffle_mask);
-
- __m128i source_shuffle = _mm_unpacklo_epi64(shuffle_lo, shuffle_hi);
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
__m128i res = _mm_maddubs_epi16(source_shuffle, filters);
res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
xx_storel_64(b, res);
- xx_storel_64(b + output_width, _mm_srli_si128(res, 8));
- a += src_pixels_per_line * 2;
- b += output_width * 2;
+ a += src_pixels_per_line;
+ b += output_width;
}
}
}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index b4b887f..fc9f725 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -125,7 +125,7 @@
set(CONFIG_INTRABC 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_INTRA_EDGE 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_INTRA_EDGE2 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_JNT_COMP 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_JNT_COMP 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_LOOPFILTERING_ACROSS_TILES 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_LOOPFILTERING_ACROSS_TILES_EXT 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_LOOPFILTER_LEVEL 1 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake
index 45a6341..59a11d7 100644
--- a/build/cmake/aom_experiment_deps.cmake
+++ b/build/cmake/aom_experiment_deps.cmake
@@ -64,6 +64,12 @@
change_config_and_warn(CONFIG_AOM_QM 1 CONFIG_AOM_QM_EXT)
endif ()
endif ()
+
+ if (CONFIG_JNT_COMP)
+ if (CONFIG_RD_DEBUG)
+ change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
+ endif()
+ endif()
endmacro ()
endif () # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_