Fix bug in selfguided_restoration neon

Fixed use of uninitialized memory in the
av1_apply_selfguided_restoration_neon and restoration_internal
functions.

BUG=b/141859709
BUG=b/141858830

Change-Id: I3359d0c8fda16f7e74296e6d34fb2070462dedf5
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index d1e93d7..5593bcb 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -376,6 +376,21 @@
       w -= 8;
       count++;
     } while (w > 0);
+
+    // memset needed for row pixels as 2nd stage of boxsum filter uses
+    // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
+    for (int x = 0; x < 2; x++) {
+      memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
+      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+    }
+
+    // memset needed for extra columns as 2nd stage of boxsum filter uses
+    // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
+    for (int x = 2; x < height + 2; x++) {
+      int dst_offset = x * dst_stride + width + 2;
+      memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
+      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+    }
   }
 
   {
@@ -792,6 +807,21 @@
       w -= 8;
       count++;
     } while (w > 0);
+
+    // memset needed for row pixels as 2nd stage of boxsum filter uses
+    // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
+    for (int x = 0; x < 2; x++) {
+      memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
+      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+    }
+
+    // memset needed for extra columns as 2nd stage of boxsum filter uses
+    // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
+    for (int x = 2; x < height + 2; x++) {
+      int dst_offset = x * dst_stride + width + 2;
+      memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
+      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+    }
   }
 
   {
@@ -1319,6 +1349,11 @@
       dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
     }
   }
+
+  // memset the uninitialized rows of the src buffer, as they are needed for
+  // the boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1360,6 +1395,10 @@
     memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
            sizeof(uint16_t) * width);
   }
+  // memset the uninitialized rows of the src buffer, as they are needed for
+  // the boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index c31b0c0..8db1d2f 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -402,7 +402,7 @@
 add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
 # TODO(b/141858830,b/141859709): neon is currently disabled due to use of
 # uninitialized memory.
-specialize qw/av1_apply_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 8446d85..c1007f1 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -412,7 +412,7 @@
 #endif
 // TODO(b/141858830,b/141859709): neon is currently disabled due to use of
 // uninitialized memory.
-#if 0  // HAVE_NEON
+#if HAVE_NEON
 const int highbd_params_neon[] = { 8, 10, 12 };
 INSTANTIATE_TEST_CASE_P(
     NEON, AV1HighbdSelfguidedFilterTest,