Fix issues with SIMD horver_correlation_full

On x86/x64, SIMD registers are shared with the FPU, so we need to clear
the system state before doing FPU ops after using SIMD instructions;
otherwise, results may be unpredictable.

Also, a zero-extend was erroneously used instead of a sign-extend on
signed data.  This wasn't caught by the unit tests because the test
stimulus only generated unsigned data.  I have updated the unit test so
that it catches this issue, and changed the zero-extend to a
sign-extend.

This should fix the issue observed in
https://aomedia-review.googlesource.com/c/aom/+/73884
so, at the same time, reinstate the SIMD-optimised horver_correlation.

Change-Id: I8db61146e46f36c23119bd1abdbb466ab3dd50bc
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index aa2aea8..9cfbff9 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1875,9 +1875,9 @@
   const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
                                 vfeatures);
-  av1_get_horver_correlation_full_c(diff, diff_stride, bw, bh,
-                                    &hfeatures[hfeatures_num - 1],
-                                    &vfeatures[vfeatures_num - 1]);
+  av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+                                  &hfeatures[hfeatures_num - 1],
+                                  &vfeatures[vfeatures_num - 1]);
   av1_nn_predict(hfeatures, nn_config_hor, hscores);
   av1_nn_predict(vfeatures, nn_config_ver, vscores);
 
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index a94c076..f588bad 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <immintrin.h>
 #include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -226,6 +227,8 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
+  aom_clear_system_state();
+
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index f5ffae7..67d94b4 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <emmintrin.h>
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -67,7 +68,7 @@
   const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
   const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
   // sum_slli_a32 = [c+b a k+j i] as i32
-  const __m128i swap_b32 = _mm_cvtepu16_epi32(swap_b);
+  const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
   // swap_b32 = [g f e 0] as i32
   *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
   *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
@@ -245,6 +246,8 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
+  aom_clear_system_state();
+
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index 5fca4b6..9a178cf 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -59,7 +59,7 @@
       float hcorr_test = 0.0, vcorr_test = 0.0;
 
       for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-        data_buf[i] = rng_.Rand16() % (1 << 12);
+        data_buf[i] = (rng_.Rand16() % (1 << 12)) - (1 << 11);
       }
 
       av1_get_horver_correlation_full_c(data_buf, MAX_SB_SIZE, w, h, &hcorr_ref,