Fix issues with SIMD horver_correlation_full

On x86/x64, SIMD registers are shared with the FPU, so we need to clear the system state before doing FPU ops after using SIMD instructions; otherwise, results may be unpredictable.

Also, a zero-extend was erroneously used instead of a sign-extend on signed data. This wasn't caught by the unit tests because the test stimulus only generated unsigned data. I have updated the unit test so that it catches this issue, and changed the zero-extend to a sign-extend.

This should fix the issue observed in https://aomedia-review.googlesource.com/c/aom/+/73884, so at the same time, reinstate the SIMD-optimised horver_correlation.

Change-Id: I8db61146e46f36c23119bd1abdbb466ab3dd50bc

commit: 819aff6309164628e81fb970cc3c0bcae611b3a9 [log] [tgz]
author: David Turner <david.turner@argondesign.com> Mon Oct 29 11:49:35 2018 +0000
committer: Debargha Mukherjee <debargha@google.com> Tue Oct 30 17:31:36 2018 +0000
tree: f66c1ce42213596ee247832e5e9f1f5399676f83
parent: dfd79b8ebfa6dbf1e438c3647be593b7dc702580 [diff]
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index aa2aea8..9cfbff9 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1875,9 +1875,9 @@
   const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
                                 vfeatures);
-  av1_get_horver_correlation_full_c(diff, diff_stride, bw, bh,
-                                    &hfeatures[hfeatures_num - 1],
-                                    &vfeatures[vfeatures_num - 1]);
+  av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+                                  &hfeatures[hfeatures_num - 1],
+                                  &vfeatures[vfeatures_num - 1]);
   av1_nn_predict(hfeatures, nn_config_hor, hscores);
   av1_nn_predict(vfeatures, nn_config_ver, vscores);
 

diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index a94c076..f588bad 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c

@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <immintrin.h>
 #include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -226,6 +227,8 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
+  aom_clear_system_state();
+
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 

diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index f5ffae7..67d94b4 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c

@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <emmintrin.h>
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -67,7 +68,7 @@
   const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
   const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
   // sum_slli_a32 = [c+b a k+j i] as i32
-  const __m128i swap_b32 = _mm_cvtepu16_epi32(swap_b);
+  const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
   // swap_b32 = [g f e 0] as i32
   *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
   *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
@@ -245,6 +246,8 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
+  aom_clear_system_state();
+
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 

diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index 5fca4b6..9a178cf 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc

@@ -59,7 +59,7 @@
       float hcorr_test = 0.0, vcorr_test = 0.0;
 
       for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-        data_buf[i] = rng_.Rand16() % (1 << 12);
+        data_buf[i] = (rng_.Rand16() % (1 << 12)) - (1 << 11);
       }
 
       av1_get_horver_correlation_full_c(data_buf, MAX_SB_SIZE, w, h, &hcorr_ref,
commit	819aff6309164628e81fb970cc3c0bcae611b3a9	[log] [tgz]
author	David Turner <david.turner@argondesign.com>	Mon Oct 29 11:49:35 2018 +0000
committer	Debargha Mukherjee <debargha@google.com>	Tue Oct 30 17:31:36 2018 +0000
tree	f66c1ce42213596ee247832e5e9f1f5399676f83
parent	dfd79b8ebfa6dbf1e438c3647be593b7dc702580 [diff]