Merge "Fix a merge bug between dual_filter and sub8x8mc" into nextgenv2
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 908c588..89ccffa 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -90,7 +90,7 @@
 }
 
 /* TODO: Optimize this function for SSE. */
-void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize)
 {
   int bi, bx, by;
@@ -100,7 +100,7 @@
       bx = bskip[bi][1];
       copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
                      dstride,
-                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
@@ -108,7 +108,35 @@
       bx = bskip[bi][1];
       copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
                      dstride,
-                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
+    }
+  }
+}
+
+/* Copy a vsize x hsize region of src, starting at row src_voffset and column src_hoffset, into the 16-bit buffer dst. TODO: Optimize this function for SSE. */
+static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride,
+    const uint8_t *src, int src_voffset, int src_hoffset, int sstride,
+    int vsize, int hsize)
+{
+  int r, c;
+  (void)cm; /* cm is only read when CONFIG_AOM_HIGHBITDEPTH is enabled */
+#if CONFIG_AOM_HIGHBITDEPTH
+  if (cm->use_highbitdepth) { /* source samples are 16-bit, reached through CONVERT_TO_SHORTPTR */
+    const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride
+                                                     + src_hoffset];
+    for (r = 0; r < vsize; r++) {
+      for (c = 0; c < hsize; c++) {
+        dst[r * dstride + c] = base[r*sstride + c];
+      }
+    }
+  } else
+#endif
+  { /* 8-bit source samples: each one is widened into dst */
+    const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
+    for (r = 0; r < vsize; r++) {
+      for (c = 0; c < hsize; c++) {
+        dst[r * dstride + c] = base[r*sstride + c];
+      }
     }
   }
 }
@@ -118,14 +146,19 @@
   int r, c;
   int sbr, sbc;
   int nhsb, nvsb;
-  od_dering_in *src[3];
+  od_dering_in src[OD_DERING_INBUF_SIZE];
+  int16_t *_linebuf[3];
+  int16_t *linebuf[3];
+  int16_t colbuf[3][OD_BSIZE_MAX + 2*OD_FILT_VBORDER][OD_FILT_HBORDER];
   unsigned char bskip[MAX_MIB_SIZE*MAX_MIB_SIZE][2];
+  unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
   int dering_count;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
   int bsize[3];
   int dec[3];
   int pli;
+  int last_sbc;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   int nplanes;
   if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
@@ -136,45 +169,145 @@
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
-  for (pli = 0; pli < 3; pli++) {
+  row_dering = aom_malloc(sizeof(*row_dering) * (nhsb + 2) * 2); /* two flag rows of nhsb entries, each with one guard entry on either side; size must match the memset below */
+  memset(row_dering, 1, sizeof(*row_dering) * (nhsb + 2) * 2); /* every flag, guards included, starts out as 1 */
+  prev_row_dering = row_dering + 1; /* skip the left guard so that index -1 and index nhsb stay inside the allocation */
+  curr_row_dering = prev_row_dering + nhsb + 2; /* second row, again starting just past its own left guard */
+  for (pli = 0; pli < nplanes; pli++) {
     dec[pli] = xd->plane[pli].subsampling_x;
-    bsize[pli] = 8 >> dec[pli];
+    bsize[pli] = 3 - dec[pli];
   }
-  stride = bsize[0] * cm->mi_cols;
-  for (pli = 0; pli < 3; pli++) {
-    src[pli] = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
-    for (r = 0; r < bsize[pli] * cm->mi_rows; ++r) {
-      for (c = 0; c < bsize[pli] * cm->mi_cols; ++c) {
-#if CONFIG_AOM_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          src[pli][r * stride + c] = CONVERT_TO_SHORTPTR(
-              xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
-        } else {
-#endif
-          src[pli][r * stride + c] =
-              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
-#if CONFIG_AOM_HIGHBITDEPTH
-        }
-#endif
-      }
-    }
+  stride = (cm->mi_cols << bsize[0]) + 2*OD_FILT_HBORDER;
+  for (pli = 0; pli < nplanes; pli++) { /* one line buffer of OD_FILT_VBORDER rows per plane */
+    int i;
+    _linebuf[pli] = aom_malloc(sizeof(**_linebuf) * OD_FILT_VBORDER * stride); /* size of one int16_t element, not of the pointer */
+    for (i = 0; i < OD_FILT_VBORDER * stride; i++)
+      _linebuf[pli][i] = OD_DERING_VERY_LARGE;
+    linebuf[pli] = _linebuf[pli] + OD_FILT_HBORDER; /* offset by the left border so negative column indices down to -OD_FILT_HBORDER are valid */
   }
   for (sbr = 0; sbr < nvsb; sbr++) {
+    last_sbc = -1;
+    for (pli = 0; pli < nplanes; pli++) {
+      for (r = 0; r < (MAX_MIB_SIZE << bsize[pli]) + 2*OD_FILT_VBORDER; r++) {
+        for (c = 0; c < OD_FILT_HBORDER; c++) {
+          colbuf[pli][r][c] = OD_DERING_VERY_LARGE;
+        }
+      }
+    }
     for (sbc = 0; sbc < nhsb; sbc++) {
       int level;
       int nhb, nvb;
+      int cstart = 0;
+      if (sbc != last_sbc + 1)
+        cstart = -OD_FILT_HBORDER;
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
       level = compute_level_from_index(
           global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
                                             MAX_MIB_SIZE * sbc]
                             ->mbmi.dering_gain);
+      curr_row_dering[sbc] = 0;
       if (level == 0 ||
           sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
         continue;
+      curr_row_dering[sbc] = 1;
       for (pli = 0; pli < nplanes; pli++) {
         int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
         int threshold;
+        int16_t *in;
+        int i, j;
+        int coffset;
+        int rend, cend;
+        if (sbc == nhsb - 1)
+          cend = (nhb << bsize[pli]);
+        else
+          cend = (nhb << bsize[pli]) + OD_FILT_HBORDER;
+        if (sbr == nvsb - 1)
+          rend = (nvb << bsize[pli]);
+        else
+          rend = (nvb << bsize[pli]) + OD_FILT_VBORDER;
+        coffset = sbc * MAX_MIB_SIZE << bsize[pli];
+        if (sbc == nhsb - 1) {
+          /* On the last superblock column, fill in the right border with
+             OD_DERING_VERY_LARGE to avoid filtering with the outside. */
+          for (r = 0; r < rend; r++) {
+            for (c = cend; c < (nhb << bsize[pli]) + OD_FILT_HBORDER; ++c) {
+              src[(r + OD_FILT_VBORDER) * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER]
+                  = OD_DERING_VERY_LARGE;
+            }
+          }
+        }
+        if (sbr == nvsb - 1) {
+          /* On the last superblock row, fill in the bottom border with
+             OD_DERING_VERY_LARGE to avoid filtering with the outside. */
+          for (r = rend; r < rend + OD_FILT_VBORDER; r++) {
+            for (c = 0; c < (nhb << bsize[pli]) + 2*OD_FILT_HBORDER; c++) {
+              src[(r + OD_FILT_VBORDER) * OD_FILT_BSTRIDE + c] =
+                  OD_DERING_VERY_LARGE;
+            }
+          }
+        }
+        in = src + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+        /* We avoid filtering the pixels for which some of the pixels to average
+           are outside the frame. We could change the filter instead, but it would
+           add special cases for any future vectorization. */
+        for (i = -OD_FILT_VBORDER; i < 0; i++) {
+          for (j = -OD_FILT_HBORDER;
+              j < (nhb << bsize[pli]) + OD_FILT_HBORDER;
+              j++) {
+            in[i * OD_FILT_BSTRIDE + j] =
+                linebuf[pli][(OD_FILT_VBORDER + i) * stride +
+                                  (sbc * MAX_MIB_SIZE << bsize[pli]) + j];
+          }
+        }
+        /* Copy in the pixels we need from the current superblock for
+           deringing.*/
+        copy_sb8_16(cm, &src[OD_FILT_VBORDER*OD_FILT_BSTRIDE + OD_FILT_HBORDER
+                             + cstart], OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+            (MAX_MIB_SIZE << bsize[pli]) * sbr, coffset + cstart,
+            xd->plane[pli].dst.stride, rend, cend-cstart);
+        if (!prev_row_dering[sbc]) { /* superblock directly above was skipped, so linebuf was not refreshed for it: read the top border rows from the frame */
+          copy_sb8_16(cm, &src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+              xd->plane[pli].dst.buf,
+              (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER, coffset,
+              xd->plane[pli].dst.stride, OD_FILT_VBORDER, nhb << bsize[pli]);
+        }
+        if (!prev_row_dering[sbc - 1]) { /* same for the above-left superblock: refill the top-left corner (OD_FILT_HBORDER columns) from the frame */
+          copy_sb8_16(cm, src, OD_FILT_BSTRIDE,
+              xd->plane[pli].dst.buf,
+              (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER,
+              coffset - OD_FILT_HBORDER,
+              xd->plane[pli].dst.stride, OD_FILT_VBORDER, OD_FILT_HBORDER);
+        }
+        if (!prev_row_dering[sbc + 1]) { /* same for the above-right superblock: refill the top-right corner from the frame */
+          copy_sb8_16(cm, &src[OD_FILT_HBORDER + (nhb << bsize[pli])],
+              OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+              (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER,
+              coffset + (nhb << bsize[pli]),
+              xd->plane[pli].dst.stride, OD_FILT_VBORDER, OD_FILT_HBORDER);
+        }
+        if (sbc == last_sbc + 1) {
+          /* If we deringed the superblock on the left then we need to copy in
+             saved pixels. */
+          for (r = 0; r < rend + OD_FILT_VBORDER; r++) {
+            for (c = 0; c < OD_FILT_HBORDER; c++) {
+              src[r * OD_FILT_BSTRIDE + c] = colbuf[pli][r][c];
+            }
+          }
+        }
+        for (r = 0; r < rend + OD_FILT_VBORDER; r++) {
+          for (c = 0; c < OD_FILT_HBORDER; c++) {
+            /* Saving pixels in case we need to dering the superblock on the
+               right. */
+            colbuf[pli][r][c] = src[r * OD_FILT_BSTRIDE + c
+                                    + (nhb << bsize[pli])];
+          }
+        }
+        copy_sb8_16(cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
+            (MAX_MIB_SIZE << bsize[pli]) * (sbr + 1) - OD_FILT_VBORDER, coffset,
+            xd->plane[pli].dst.stride, OD_FILT_VBORDER,
+            (nhb << bsize[pli]));
+
         /* FIXME: This is a temporary hack that uses more conservative
            deringing for chroma. */
         if (pli)
@@ -182,35 +315,40 @@
         else
           threshold = level << coeff_shift;
         if (threshold == 0) continue;
-        od_dering(dst, MAX_MIB_SIZE * bsize[pli],
-                  &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
-                            sbc * bsize[pli] * MAX_MIB_SIZE],
-                  stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
+        od_dering(dst, in, dec[pli], dir, pli,
                   bskip, dering_count, threshold, coeff_shift);
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           copy_blocks_16bit(
               (int16_t*)&CONVERT_TO_SHORTPTR(
                   xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
-                  (bsize[pli] * MAX_MIB_SIZE * sbr) +
-                  sbc * bsize[pli] * MAX_MIB_SIZE],
-              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+                  (MAX_MIB_SIZE * sbr << bsize[pli]) +
+                  (sbc * MAX_MIB_SIZE << bsize[pli])],
+              xd->plane[pli].dst.stride, dst, bskip,
               dering_count, 3 - dec[pli]);
         } else {
 #endif
           copy_blocks_16_8bit(
               &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
-                                    (bsize[pli] * MAX_MIB_SIZE * sbr) +
-                                    sbc * bsize[pli] * MAX_MIB_SIZE],
-              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+                                    (MAX_MIB_SIZE * sbr << bsize[pli]) +
+                                    (sbc * MAX_MIB_SIZE << bsize[pli])],
+              xd->plane[pli].dst.stride, dst, bskip,
               dering_count, 3 - dec[pli]);
 #if CONFIG_AOM_HIGHBITDEPTH
         }
 #endif
       }
+      last_sbc = sbc;
+    }
+    {
+      unsigned char *tmp;
+      tmp = prev_row_dering;
+      prev_row_dering = curr_row_dering;
+      curr_row_dering = tmp;
     }
   }
+  aom_free(row_dering);
   for (pli = 0; pli < nplanes; pli++) {
-    aom_free(src[pli]);
+    aom_free(_linebuf[pli]);
   }
 }
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index f19291c..168bab1 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -111,10 +111,6 @@
   return best_dir;
 }
 
-#define OD_DERING_VERY_LARGE (30000)
-#define OD_DERING_INBUF_SIZE \
-  ((OD_BSIZE_MAX + 2 * OD_FILT_BORDER) * (OD_BSIZE_MAX + 2 * OD_FILT_BORDER))
-
 /* Smooth in the direction detected. */
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir) {
@@ -277,7 +273,7 @@
 }
 
 /* TODO: Optimize this function for SSE. */
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize)
 {
   int bi, bx, by;
@@ -287,7 +283,7 @@
       bx = bskip[bi][1];
       copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
                      dstride,
-                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
@@ -295,25 +291,19 @@
       bx = bskip[bi][1];
       copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
                      dstride,
-                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   }
 }
 
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
-               int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int16_t *in, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char (*bskip)[2], int dering_count, int threshold,
                int coeff_shift) {
-  int i;
-  int j;
   int bi;
   int bx;
   int by;
-  int16_t inbuf[OD_DERING_INBUF_SIZE];
-  int16_t *in;
   int bsize;
-  int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int filter2_thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
     od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
@@ -322,24 +312,13 @@
     od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
   };
   bsize = 3 - xdec;
-  in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
-  /* We avoid filtering the pixels for which some of the pixels to average
-     are outside the frame. We could change the filter instead, but it would
-     add special cases for any future vectorization. */
-  for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
-  for (i = -OD_FILT_BORDER * (sby != 0);
-       i < (nvb << bsize) + OD_FILT_BORDER * (sby != nvsb - 1); i++) {
-    for (j = -OD_FILT_BORDER * (sbx != 0);
-         j < (nhb << bsize) + OD_FILT_BORDER * (sbx != nhsb - 1); j++) {
-      in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
-    }
-  }
   if (pli == 0) {
     for (bi = 0; bi < dering_count; bi++) {
+      int32_t var;
       by = bskip[bi][0];
       bx = bskip[bi][1];
-      dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
-                                 &var[by][bx], coeff_shift);
+      dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx], OD_FILT_BSTRIDE,
+                                 &var, coeff_shift);
       /* Deringing orthogonal to the direction uses a tighter threshold
          because we want to be conservative. We've presumably already
          achieved some deringing, so the amount of change is expected
@@ -349,28 +328,28 @@
          since the ringing there tends to be directional, so it doesn't
          get removed by the directional filtering. */
       filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &y[bi << 2*bsize], 1 << bsize,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
-          od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
+          od_adjust_thresh(threshold, var), dir[by][bx]);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
       by = bskip[bi][0];
       bx = bskip[bi][1];
       filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &y[bi << 2*bsize], 1 << bsize,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
           dir[by][bx]);
     }
   }
-  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, bskip, dering_count,
       bsize);
   for (bi = 0; bi < dering_count; bi++) {
     by = bskip[bi][0];
     bx = bskip[bi][1];
     if (filter2_thresh[by][bx] == 0) continue;
     (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
-        &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+        &y[bi << 2*bsize], 1 << bsize,
         &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
         dir[by][bx]);
   }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 97090e5..e10871e 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -25,8 +25,16 @@
 
 #define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
 
-#define OD_FILT_BORDER (3)
-#define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2 * OD_FILT_BORDER)
+/* Rows of border kept above and below the block in the filter input buffer. */
+#define OD_FILT_VBORDER (3)
+/* Columns of border kept left and right of the block. Three would be enough,
+   but four is used to make vectorization easier. */
+#define OD_FILT_HBORDER (4)
+#define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2 * OD_FILT_HBORDER) /* row stride of the filter input buffer, in samples */
+
+#define OD_DERING_VERY_LARGE (30000) /* marker value written into border samples that lie outside the frame */
+#define OD_DERING_INBUF_SIZE \
+  (OD_FILT_BSTRIDE * (OD_BSIZE_MAX + 2 * OD_FILT_VBORDER))
 
 extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
 
@@ -36,11 +44,10 @@
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize);
 
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
-               int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int16_t *in, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char (*bskip)[2], int skip_stride, int threshold,
                int coeff_shift);
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index e5ed39d..f73e777 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -35,18 +35,18 @@
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
+          ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
           WARPEDDIFF_PREC_BITS + 1);
     else
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x << WARPEDMODEL_PREC_BITS) + mat[1]), WARPEDDIFF_PREC_BITS);
+          ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[1]), WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
+          ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
           WARPEDDIFF_PREC_BITS + 1);
     else
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y << WARPEDMODEL_PREC_BITS)) + mat[0], WARPEDDIFF_PREC_BITS);
+          ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[0], WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -119,12 +119,12 @@
     y = (subsampling_y ? 4 * y + 1 : 2 * y);
 
     Z = (mat[7] * x + mat[6] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
-    xp = (mat[1] * x + mat[0] * y + 2 * mat[3])
-         << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-             WARPEDMODEL_PREC_BITS);
-    yp = (mat[2] * x + mat[5] * y + 2 * mat[4])
-         << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-             WARPEDMODEL_PREC_BITS);
+    xp = (mat[1] * x + mat[0] * y + 2 * mat[3]) *
+         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+                WARPEDMODEL_PREC_BITS));
+    yp = (mat[2] * x + mat[5] * y + 2 * mat[4]) *
+         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+                WARPEDMODEL_PREC_BITS));
 
     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
@@ -220,9 +220,9 @@
     const int64_t v3 = x * (p[1] - p[-1]);
     const int64_t v4 = 2 * p[0];
     return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
-        (v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
-            (v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
-            (v2 << WARPEDPIXEL_PREC_BITS) + v1,
+        (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
+            (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
+            (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
         3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
   }
 }
@@ -246,10 +246,10 @@
                   i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
                   j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j << WARPEDPIXEL_PREC_BITS));
+                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i << WARPEDPIXEL_PREC_BITS));
+                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint8_t)clip_pixel(val);
 }
@@ -262,9 +262,10 @@
   for (k = 0; k < 4; ++k) {
     int32_t arr_temp[4];
     get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+    arr[k] =
+        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
-  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint8_t)clip_pixel(val);
 }
@@ -272,8 +273,8 @@
 static uint8_t bi_linear_filter(uint8_t *ref, int x, int y, int stride) {
   const int ix = x >> WARPEDPIXEL_PREC_BITS;
   const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t val;
   val = ROUND_POWER_OF_TWO_SIGNED(
       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
@@ -289,8 +290,8 @@
                                 int height, int stride) {
   int ix = x >> WARPEDPIXEL_PREC_BITS;
   int iy = y >> WARPEDPIXEL_PREC_BITS;
-  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t v;
 
   if (ix < 0 && iy < 0)
@@ -357,10 +358,10 @@
                          i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
                          j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j << WARPEDPIXEL_PREC_BITS));
+                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i << WARPEDPIXEL_PREC_BITS));
+                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint16_t)clip_pixel_highbd(val, bd);
 }
@@ -374,9 +375,10 @@
   for (k = 0; k < 4; ++k) {
     int32_t arr_temp[4];
     highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+    arr[k] =
+        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
-  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint16_t)clip_pixel_highbd(val, bd);
 }
@@ -385,8 +387,8 @@
                                         int bd) {
   const int ix = x >> WARPEDPIXEL_PREC_BITS;
   const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t val;
   val = ROUND_POWER_OF_TWO_SIGNED(
       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
@@ -402,8 +404,8 @@
                                         int height, int stride, int bd) {
   int ix = x >> WARPEDPIXEL_PREC_BITS;
   int iy = y >> WARPEDPIXEL_PREC_BITS;
-  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t v;
 
   if (ix < 0 && iy < 0)
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 0c79e45..0b65740 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -58,11 +58,11 @@
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
   for (pli = 0; pli < 3; pli++) {
     dec[pli] = xd->plane[pli].subsampling_x;
-    bsize[pli] = 8 >> dec[pli];
+    bsize[pli] = 3 - dec[pli];
   }
-  stride = bsize[0] * cm->mi_cols;
-  for (r = 0; r < bsize[0] * cm->mi_rows; ++r) {
-    for (c = 0; c < bsize[0] * cm->mi_cols; ++c) {
+  stride = cm->mi_cols << bsize[0];
+  for (r = 0; r < cm->mi_rows << bsize[0]; ++r) {
+    for (c = 0; c < cm->mi_cols << bsize[0]; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
         src[r * stride + c] = CONVERT_TO_SHORTPTR(
@@ -98,6 +98,7 @@
       int best_gi;
       int32_t best_mse = INT32_MAX;
       int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
+      int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
       if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
@@ -106,26 +107,41 @@
       for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
         int cur_mse;
         int threshold;
+        int16_t inbuf[OD_DERING_INBUF_SIZE];
+        int16_t *in;
+        int i, j;
         level = compute_level_from_index(best_level, gi);
         threshold = level << coeff_shift;
-        for (r = 0; r < bsize[0] * nvb; r++) {
-          for (c = 0; c < bsize[0] * nhb; c++) {
-            dst[r * MAX_MIB_SIZE * bsize[0] + c] =
-                src[(sbr * bsize[0] * MAX_MIB_SIZE + r) * stride +
-                    sbc * bsize[0] * MAX_MIB_SIZE + c];
+        for (r = 0; r < nvb << bsize[0]; r++) {
+          for (c = 0; c < nhb << bsize[0]; c++) {
+            dst[(r * MAX_MIB_SIZE << bsize[0]) + c] =
+                src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride +
+                    (sbc * MAX_MIB_SIZE << bsize[0]) + c];
           }
         }
-        od_dering(dst, MAX_MIB_SIZE * bsize[0],
-                  &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
-                       sbc * bsize[0] * MAX_MIB_SIZE],
-                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
-                  dir, 0,
-                  bskip,
-                  dering_count, threshold, coeff_shift);
+        in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+        /* We avoid filtering the pixels for which some of the pixels to average
+           are outside the frame. We could change the filter instead, but it would
+           add special cases for any future vectorization. */
+        for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
+        for (i = -OD_FILT_VBORDER * (sbr != 0);
+             i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) {
+          for (j = -OD_FILT_HBORDER * (sbc != 0);
+               j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1); j++) {
+            int16_t *x;
+            x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
+                     (sbc * MAX_MIB_SIZE << bsize[0])];
+            in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j];
+          }
+        }
+        od_dering(tmp_dst, in, 0, dir, 0, bskip, dering_count, threshold,
+            coeff_shift);
+        copy_blocks_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst, bskip,
+            dering_count, 3);
         cur_mse = (int)compute_dist(
-            dst, MAX_MIB_SIZE * bsize[0],
-            &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
-                       sbc * bsize[0] * MAX_MIB_SIZE],
+            dst, MAX_MIB_SIZE << bsize[0],
+            &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
+                       (sbc * MAX_MIB_SIZE << bsize[0])],
             stride, nhb, nvb, coeff_shift);
         if (cur_mse < best_mse) {
           best_gi = gi;