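Further optimizations of loop restoration

Narrow the projection-search working buffers and the helper signatures
in av1/encoder/pickrst.c from int64_t to int32_t.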

Change-Id: I4c4300f3f565d8aecf65669b77aaa874bb73a3a0
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index d904a32..9ee58ff 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -121,10 +121,10 @@
   return filt_err;
 }
 
-static int64_t get_pixel_proj_error(int64_t *src, int width, int height,
-                                    int src_stride, int64_t *dgd,
-                                    int dgd_stride, int64_t *flt1,
-                                    int flt1_stride, int64_t *flt2,
+static int64_t get_pixel_proj_error(int32_t *src, int width, int height,
+                                    int src_stride, int32_t *dgd,
+                                    int dgd_stride, int32_t *flt1,
+                                    int flt1_stride, int32_t *flt2,
                                     int flt2_stride, int *xqd) {
   int i, j;
   int64_t err = 0;
@@ -132,12 +132,12 @@
   decode_xq(xqd, xq);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
-      const int64_t s = (int64_t)src[i * src_stride + j];
-      const int64_t u = (int64_t)dgd[i * dgd_stride + j];
-      const int64_t f1 = (int64_t)flt1[i * flt1_stride + j] - u;
-      const int64_t f2 = (int64_t)flt2[i * flt2_stride + j] - u;
+      const int32_t s = (int32_t)src[i * src_stride + j];
+      const int32_t u = (int32_t)dgd[i * dgd_stride + j];
+      const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
+      const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
       const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
-      const int64_t e =
+      const int32_t e =
           ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
           ROUND_POWER_OF_TWO(s, SGRPROJ_RST_BITS);
       err += e * e;
@@ -146,9 +146,9 @@
   return err;
 }
 
-static void get_proj_subspace(int64_t *src, int width, int height,
-                              int src_stride, int64_t *dgd, int dgd_stride,
-                              int64_t *flt1, int flt1_stride, int64_t *flt2,
+static void get_proj_subspace(int32_t *src, int width, int height,
+                              int src_stride, int32_t *dgd, int dgd_stride,
+                              int32_t *flt1, int flt1_stride, int32_t *flt2,
                               int flt2_stride, int *xq) {
   int i, j;
   double H[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -198,10 +198,10 @@
                                           int src_stride, int bit_depth,
                                           int *eps, int *xqd, void *srcbuf,
                                           void *rstbuf) {
-  int64_t *srd = (int64_t *)srcbuf;
-  int64_t *dgd = (int64_t *)rstbuf;
-  int64_t *flt1 = dgd + RESTORATION_TILEPELS_MAX;
-  int64_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+  int32_t *srd = (int32_t *)srcbuf;
+  int32_t *dgd = (int32_t *)rstbuf;
+  int32_t *flt1 = dgd + RESTORATION_TILEPELS_MAX;
+  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
   uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
   int i, j, ep, bestep = 0;
   int64_t err, besterr = -1;
@@ -213,11 +213,11 @@
       uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
       for (i = 0; i < height; ++i) {
         for (j = 0; j < width; ++j) {
-          flt1[i * width + j] = (int64_t)dat[i * dat_stride + j];
-          flt2[i * width + j] = (int64_t)dat[i * dat_stride + j];
-          dgd[i * width + j] = (int64_t)dat[i * dat_stride + j]
+          flt1[i * width + j] = (int32_t)dat[i * dat_stride + j];
+          flt2[i * width + j] = (int32_t)dat[i * dat_stride + j];
+          dgd[i * width + j] = (int32_t)dat[i * dat_stride + j]
                                << SGRPROJ_RST_BITS;
-          srd[i * width + j] = (int64_t)src[i * src_stride + j]
+          srd[i * width + j] = (int32_t)src[i * src_stride + j]
                                << SGRPROJ_RST_BITS;
         }
       }
@@ -228,10 +228,10 @@
         for (j = 0; j < width; ++j) {
           const int k = i * width + j;
           const int l = i * dat_stride + j;
-          flt1[k] = (int64_t)dat[l];
-          flt2[k] = (int64_t)dat[l];
-          dgd[k] = (int64_t)dat[l] << SGRPROJ_RST_BITS;
-          srd[k] = (int64_t)src[i * src_stride + j] << SGRPROJ_RST_BITS;
+          flt1[k] = (int32_t)dat[l];
+          flt2[k] = (int32_t)dat[l];
+          dgd[k] = (int32_t)dat[l] << SGRPROJ_RST_BITS;
+          srd[k] = (int32_t)src[i * src_stride + j] << SGRPROJ_RST_BITS;
         }
       }
     }