Fix two bugs in highbitdepth self-guided filter

This filter was temporarily removed due to test failures.
This patch reintroduces the filter and fixes two bugs:

* The test cases would occasionally segfault on x86, since
  the highbd filter requires its inputs to be aligned to
  16 bytes. This will always be true when used on real videos,
  so adjust the test cases to match.

* The function calc_block was incorrect for bit_depth > 8,
  due to passing an incorrect argument to _mm_srl_epi32().
  This was the cause of the original test failures.

BUG=aomedia:392

Change-Id: Ia06b76c3e6122eebadd0995fb62f32c2fcab8b3e
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b389b70..9b03e5c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -791,7 +791,7 @@
 
   if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
-    specialize qw/apply_selfguided_restoration_highbd /;
+    specialize qw/apply_selfguided_restoration_highbd sse4_1/;
 
     add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
     specialize qw/av1_selfguided_restoration_highbd sse4_1/;
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 78c3341..87018b5 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -16,10 +16,10 @@
   if (bit_depth > 8) {
     __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
     __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
-    a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a),
-                      _mm_set1_epi32(2 * (bit_depth - 8)));
-    b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b),
-                      _mm_set1_epi32(bit_depth - 8));
+    __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
+    __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
+    a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
+    b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
     a = _mm_mullo_epi32(a, n);
     b = _mm_mullo_epi32(b, b);
     p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
@@ -1719,4 +1719,87 @@
   }
 }
 
+void apply_selfguided_restoration_highbd_sse4_1(
+    uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
+    int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
+  int xq[2];
+  int32_t *flt1 = tmpbuf;
+  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+  int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+  int i, j;
+  assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+  av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
+                                    sgr_params[eps].corner,
+                                    sgr_params[eps].edge);
+#else
+  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
+                                           width, bit_depth, sgr_params[eps].r1,
+                                           sgr_params[eps].e1, tmpbuf2);
+#endif  // USE_HIGHPASS_IN_SGRPROJ
+  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
+                                           width, bit_depth, sgr_params[eps].r2,
+                                           sgr_params[eps].e2, tmpbuf2);
+  decode_xq(xqd, xq);
+
+  __m128i xq0 = _mm_set1_epi32(xq[0]);
+  __m128i xq1 = _mm_set1_epi32(xq[1]);
+  for (i = 0; i < height; ++i) {
+    // Calculate output in batches of 8 pixels
+    for (j = 0; j < width; j += 8) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int m = i * dst_stride + j;
+      __m128i src =
+          _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS);
+
+      const __m128i u_0 = _mm_cvtepu16_epi32(src);
+      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
+
+      const __m128i f1_0 =
+          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
+      const __m128i f2_0 =
+          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
+      const __m128i f1_1 =
+          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
+      const __m128i f2_1 =
+          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
+
+      const __m128i v_0 = _mm_add_epi32(
+          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
+          _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
+      const __m128i v_1 = _mm_add_epi32(
+          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
+          _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+      const __m128i rounding =
+          _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+      const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+      // Pack into 16 bits and clamp to [0, 2^bit_depth)
+      const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+      const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+      const __m128i res = _mm_min_epi16(tmp, max);
+
+      _mm_store_si128((__m128i *)&dst[m], res);
+    }
+    // Process leftover pixels
+    for (; j < width; ++j) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int m = i * dst_stride + j;
+      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+      const int32_t f1 = (int32_t)flt1[k] - u;
+      const int32_t f2 = (int32_t)flt2[k] - u;
+      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+      const int16_t w =
+          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+    }
+  }
+}
+
 #endif
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 6e4f5d8..04d658c 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -44,8 +44,8 @@
     const int NUM_ITERS = 2000;
     int i, j;
 
-    uint8_t *input = new uint8_t[w * h];
-    uint8_t *output = new uint8_t[w * h];
+    uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
+    uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
     memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
 
@@ -76,9 +76,9 @@
     printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
            elapsed, elapsed * 1000000. / NUM_ITERS);
 
+    aom_free(input);
+    aom_free(output);
     aom_free(tmpbuf);
-    delete[] input;
-    delete[] output;
   }
 
   void RunCorrectnessTest() {
@@ -89,9 +89,12 @@
     const int NUM_ITERS = 81;
     int i, j, k;
 
-    uint8_t *input = new uint8_t[stride * max_h];
-    uint8_t *output = new uint8_t[out_stride * max_h];
-    uint8_t *output2 = new uint8_t[out_stride * max_h];
+    uint8_t *input =
+        (uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
+    uint8_t *output =
+        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
+    uint8_t *output2 =
+        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
     memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
 
@@ -124,10 +127,10 @@
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
     }
 
+    aom_free(input);
+    aom_free(output);
+    aom_free(output2);
     aom_free(tmpbuf);
-    delete[] input;
-    delete[] output;
-    delete[] output2;
   }
 };
 
@@ -161,8 +164,8 @@
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input = new uint16_t[w * h];
-    uint16_t *output = new uint16_t[w * h];
+    uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
+    uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
     memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
 
@@ -194,9 +197,9 @@
     printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
            elapsed, elapsed * 1000000. / NUM_ITERS);
 
+    aom_free(input);
+    aom_free(output);
     aom_free(tmpbuf);
-    delete[] input;
-    delete[] output;
   }
 
   void RunCorrectnessTest() {
@@ -209,9 +212,12 @@
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input = new uint16_t[stride * max_h];
-    uint16_t *output = new uint16_t[out_stride * max_h];
-    uint16_t *output2 = new uint16_t[out_stride * max_h];
+    uint16_t *input =
+        (uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
+    uint16_t *output =
+        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
+    uint16_t *output2 =
+        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
     memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
 
@@ -246,10 +252,10 @@
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
     }
 
+    aom_free(input);
+    aom_free(output);
+    aom_free(output2);
     aom_free(tmpbuf);
-    delete[] input;
-    delete[] output;
-    delete[] output2;
   }
 };