Add x and y gradients to image pyramid

This currently uses floating point and will eventually need to
be integerized if disflow proves to give a good speedup.
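
For reference, the 3x3 Sobel kernel is applied separably: a 3-tap
derivative filter {1, 0, -1} along the gradient axis and a 3-tap
smoothing filter {1, 2, 1} along the other axis. For the x gradient:

    [ 1  0 -1 ]   [ 1 ]
    [ 2  0 -2 ] = [ 2 ] * [ 1  0 -1 ]
    [ 1  0 -1 ]   [ 1 ]

The result is scaled by 1/8, the sum of the kernel's absolute
weights, which keeps the gradient magnitude within the range of the
8-bit input.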

Change-Id: I7038cb8d5769622f9723d264c9996b197fb6d186
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 1f11126..8ba3ed4 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -73,6 +73,45 @@
   }
 }
 
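+// Apply a 3-tap separable Sobel filter to an 8-bit source block and store
+// the result, scaled by 'norm', in a double-precision destination buffer.
+// dir == 1 computes the x gradient (derivative horizontally, smoothing
+// vertically); dir == 0 computes the y gradient.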
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
+  DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
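+  // The 2-D Sobel kernel is the outer product of the derivative taps
+  // sobel_a and the smoothing taps sobel_b; 'dir' selects which axis
+  // gets the derivative.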
+  const int taps = 3;
+  int im_h = h + taps - 1;
+  int im_stride = w;
+  const int fo_vert = 1;
+  const int fo_horiz = 1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = dir ? sobel_a : sobel_b;
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int16_t sum = 0;
+      for (int k = 0; k < taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      im_block[y * im_stride + x] = sum;
+    }
+  }
+
+  // vertical filter
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = dir ? sobel_b : sobel_a;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int16_t sum = 0;
+      for (int k = 0; k < taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      dst[y * dst_stride + x] = sum * norm;
+    }
+  }
+}
+
 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h,
                           const InterpFilterParams *filter_params_x,
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 4109dd8..d0972db 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -118,6 +118,11 @@
                                    const struct scale_factors *sf,
                                    int is_intrabc, int bd);
 
+// TODO(sarahparker) This will need to be integerized and optimized
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 3aaf22d..8b12ce4 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -17,6 +17,7 @@
 
 #include "av1/encoder/global_motion.h"
 
+#include "av1/common/convolve.h"
 #include "av1/common/resize.h"
 #include "av1/common/warped_motion.h"
 
@@ -50,6 +51,8 @@
   int strides[N_LEVELS];
   int level_loc[N_LEVELS];
   unsigned char *level_buffer;
+  double *level_dx_buffer;
+  double *level_dy_buffer;
 } ImagePyramid;
 
 static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
@@ -316,19 +319,47 @@
   return 0;
 }
 #else
+// Compute an image gradient using a Sobel filter.
+// If dir == 1, compute the x gradient; if dir == 0, compute the y gradient.
+// This function assumes the images have been padded so that they can be
+// processed in units of 8 pixels.
+static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
+                                           double *dst, int dst_stride,
+                                           int height, int width, int dir) {
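+  // The absolute weights of the 3x3 Sobel kernel sum to 8, so scale by
+  // 1/8 to keep the gradient within the range of the 8-bit input.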
+  const double norm = 1.0 / 8;
+  // TODO(sarahparker) experiment with doing this over larger block sizes
+  const int block_unit = 8;
+  // Filter in 8x8 blocks to eventually make use of an optimized convolve
+  // function
+  for (int i = 0; i < height; i += block_unit) {
+    for (int j = 0; j < width; j += block_unit) {
+      av1_convolve_2d_sobel_y_c(src + i * src_stride + j, src_stride,
+                                dst + i * dst_stride + j, dst_stride,
+                                block_unit, block_unit, dir, norm);
+    }
+  }
+}
+
 static ImagePyramid *alloc_pyramid(int width, int height, int pad_size) {
   ImagePyramid *pyr = aom_malloc(sizeof(*pyr));
   // 2 * width * height is the upper bound for a buffer that fits
   // all pyramid levels + padding for each level
   const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height +
                           (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
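+  // The gradient buffers use the same layout as level_buffer but store
+  // doubles, so the whole element count is scaled by the element size.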
+  const int gradient_size =
+      sizeof(*pyr->level_dx_buffer) *
+      (2 * width * height + (width + 2 * pad_size) * 2 * pad_size * N_LEVELS);
   pyr->level_buffer = aom_malloc(buffer_size);
+  pyr->level_dx_buffer = aom_malloc(gradient_size);
+  pyr->level_dy_buffer = aom_malloc(gradient_size);
   memset(pyr->level_buffer, 0, buffer_size);
+  memset(pyr->level_dx_buffer, 0, gradient_size);
+  memset(pyr->level_dy_buffer, 0, gradient_size);
   return pyr;
 }
 
 static void free_pyramid(ImagePyramid *pyr) {
   aom_free(pyr->level_buffer);
+  aom_free(pyr->level_dx_buffer);
+  aom_free(pyr->level_dy_buffer);
   aom_free(pyr);
 }
 
@@ -384,7 +415,15 @@
                      frm_pyr->level_buffer + cur_loc, cur_height, cur_width,
                      cur_stride);
 
-    // TODO(sarahparker) Add computation of gradient pyramids here
+    // Compute the x gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 1);
+
+    // Compute the y gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 0);
   }
 }
 
@@ -416,6 +455,12 @@
     ref_buffer = downconvert_frame(ref, bit_depth);
   }
 
+  // TODO(sarahparker) We will want to do the source pyramid computation
+  // outside of this function so it doesn't get recomputed for every
+  // reference. We also don't need to compute every pyramid level for the
+  // reference in advance, since lower levels can be overwritten once their
+  // flow field is computed and upscaled. I'll add these optimizations
+  // once the full implementation is working.
   // Allocate frm image pyramids
   ImagePyramid *frm_pyr = alloc_pyramid(frm_width, frm_height, pad_size);
   compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm->y_stride,