aom_dsp/flow_estimation/disflow.c - avm - Git at Google

 /*
  * Copyright (c) 2022, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/flow_estimation/disflow.h"
 #include "aom_dsp/flow_estimation/corner_detect.h"
 #include "aom_dsp/flow_estimation/pyramid.h"
 #include "aom_dsp/flow_estimation/ransac.h"
 #include "aom_mem/aom_mem.h"

 #include "config/av1_rtcd.h"

 // TODO(rachelbarker): Move needed code from av1/ to aom_dsp/
 #include "av1/common/resize.h"

 #include <assert.h>

 // Size of square patches in the disflow dense grid
 #define PATCH_SIZE 8
 // Center point of square patch
 #define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
 // Step size between patches, lower value means greater patch overlap
 #define PATCH_STEP 1
 // Warp error convergence threshold for disflow
 #define DISFLOW_ERROR_TR 0.01
 // Max number of iterations if warp convergence is not found
 // TODO(rachelbarker): Experiment with different numbers of pyramid levels
 // and numbers of refinement steps per level
 #define DISFLOW_MAX_ITR 10

 // Don't use points around the frame border since they are less reliable
 static INLINE int valid_point(int x, int y, int width, int height) {
   return (x > (PATCH_SIZE + PATCH_CENTER)) &&
          (x < (width - PATCH_SIZE - PATCH_CENTER)) &&
          (y > (PATCH_SIZE + PATCH_CENTER)) &&
          (y < (height - PATCH_SIZE - PATCH_CENTER));
 }

 static int determine_disflow_correspondence(int *frm_corners,
                                             int num_frm_corners, double *flow_u,
                                             double *flow_v, int width,
                                             int height, int stride,
                                             Correspondence *correspondences) {
   int num_correspondences = 0;
   int x, y;
   for (int i = 0; i < num_frm_corners; ++i) {
     x = frm_corners[2 * i];
     y = frm_corners[2 * i + 1];
     if (valid_point(x, y, width, height)) {
       correspondences[num_correspondences].x = x;
       correspondences[num_correspondences].y = y;
       correspondences[num_correspondences].rx = x + flow_u[y * stride + x];
       correspondences[num_correspondences].ry = y + flow_v[y * stride + x];
       num_correspondences++;
     }
   }
   return num_correspondences;
 }

 static void getCubicKernel(double x, double *kernel) {
   assert(0 <= x && x < 1);
   double x2 = x * x;
   double x3 = x2 * x;
   kernel[0] = -0.5 * x + x2 - 0.5 * x3;
   kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
   kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
   kernel[3] = -0.5 * x2 + 0.5 * x3;
 }

 static double getCubicValue(double *p, double *kernel) {
   return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
          kernel[3] * p[3];
 }

 // Warps a block using flow vector [u, v] and computes the mse
 static double compute_warp_and_error(unsigned char *ref, unsigned char *frm,
                                      int width, int height, int stride, int x,
                                      int y, double u, double v, int16_t *dt) {
   unsigned char warped;
   int x_w, y_w;
   double mse = 0;
   int16_t err = 0;

   // Split offset into integer and fractional parts, and compute cubic
   // interpolation kernels
   int u_int = (int)floor(u);
   int v_int = (int)floor(v);
   double u_frac = u - u_int;
   double v_frac = v - v_int;

   double h_kernel[4];
   double v_kernel[4];
   getCubicKernel(u_frac, h_kernel);
   getCubicKernel(v_frac, v_kernel);

   // Storage for intermediate values between the two convolution directions
   double tmp_[PATCH_SIZE * (PATCH_SIZE + 3)];
   double *tmp = tmp_ + PATCH_SIZE;  // Offset by one row

   // Clamp coordinates so that all pixels we fetch will remain within the
   // allocated border region, but allow them to go far enough out that
   // the border pixels' values do not change.
   // Since we are calculating an 8x8 block, the bottom-right pixel
   // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
   // interpolation has 4 taps, meaning that the output of pixel
   // (x_w, y_w) depends on the pixels in the range
   // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
   //
   // Thus the most extreme coordinates which will be fetched are
   // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
   int x0 = clamp(x + u_int, -9, width);
   int y0 = clamp(y + v_int, -9, height);

   // Horizontal convolution
   for (int i = -1; i < PATCH_SIZE + 2; ++i) {
     y_w = y0 + i;
     for (int j = 0; j < PATCH_SIZE; ++j) {
       x_w = x0 + j;
       double arr[4];

       arr[0] = (double)ref[y_w * stride + (x_w - 1)];
       arr[1] = (double)ref[y_w * stride + (x_w + 0)];
       arr[2] = (double)ref[y_w * stride + (x_w + 1)];
       arr[3] = (double)ref[y_w * stride + (x_w + 2)];

       tmp[i * PATCH_SIZE + j] = getCubicValue(arr, h_kernel);
     }
   }

   // Vertical convolution
   for (int i = 0; i < PATCH_SIZE; ++i) {
     for (int j = 0; j < PATCH_SIZE; ++j) {
       double *p = &tmp[i * PATCH_SIZE + j];
       double arr[4] = { p[-PATCH_SIZE], p[0], p[PATCH_SIZE],
                         p[2 * PATCH_SIZE] };
       double result = getCubicValue(arr, v_kernel);

       warped = clamp((int)(result + 0.5), 0, 255);
       err = warped - frm[(x + j) + (y + i) * stride];
       mse += err * err;
       dt[i * PATCH_SIZE + j] = err;
     }
   }

   mse /= (PATCH_SIZE * PATCH_SIZE);
   return mse;
 }

 // Computes the components of the system of equations used to solve for
 // a flow vector. This includes:
 // 1.) The hessian matrix for optical flow. This matrix is in the
 // form of:
 //
 //       M = |sum(dx * dx)  sum(dx * dy)|
 //           |sum(dx * dy)  sum(dy * dy)|
 //
 // 2.)   b = |sum(dx * dt)|
 //           |sum(dy * dt)|
 // Where the sums are computed over a square window of PATCH_SIZE.
 static INLINE void compute_flow_system(const double *dx, int dx_stride,
                                        const double *dy, int dy_stride,
                                        const int16_t *dt, int dt_stride,
                                        double *M, double *b) {
   for (int i = 0; i < PATCH_SIZE; i++) {
     for (int j = 0; j < PATCH_SIZE; j++) {
       M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
       M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
       M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];

       b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
       b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
     }
   }

   M[2] = M[1];
 }

 // Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix
 static INLINE void solve_2x2_system(const double *M, const double *b,
                                     double *output_vec) {
   double M_0 = M[0];
   double M_3 = M[3];
   double det = (M_0 * M_3) - (M[1] * M[2]);
   if (det < 1e-5) {
     // Handle singular matrix
     // TODO(sarahparker) compare results using pseudo inverse instead
     M_0 += 1e-10;
     M_3 += 1e-10;
     det = (M_0 * M_3) - (M[1] * M[2]);
   }
   const double det_inv = 1 / det;
   const double mult_b0 = det_inv * b[0];
   const double mult_b1 = det_inv * b[1];
   output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1;
   output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
 }

 /*
 static INLINE void image_difference(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int16_t *dst, int dst_stride, int height,
                                     int width) {
   const int block_unit = 8;
   // Take difference in 8x8 blocks to make use of optimized diff function
   for (int i = 0; i < height; i += block_unit) {
     for (int j = 0; j < width; j += block_unit) {
       aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j,
                          dst_stride, src + i * src_stride + j, src_stride,
                          ref + i * ref_stride + j, ref_stride);
     }
   }
 }
 */

 static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
                                          int x, int y, int width, int height,
                                          int stride, double *u, double *v) {
   double M[4] = { 0 };
   double b[2] = { 0 };
   double tmp_output_vec[2] = { 0 };
   double error = 0;
   int16_t dt[PATCH_SIZE * PATCH_SIZE];
   double o_u = *u;
   double o_v = *v;

   double dx_tmp[PATCH_SIZE * PATCH_SIZE];
   double dy_tmp[PATCH_SIZE * PATCH_SIZE];

   // Compute gradients within this patch
   unsigned char *frm_patch = &frm[y * stride + x];
   av1_convolve_2d_sobel_y_c(frm_patch, stride, dx_tmp, PATCH_SIZE, PATCH_SIZE,
                             PATCH_SIZE, 1, 1.0);
   av1_convolve_2d_sobel_y_c(frm_patch, stride, dy_tmp, PATCH_SIZE, PATCH_SIZE,
                             PATCH_SIZE, 0, 1.0);

   for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
     error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
                                    *v, dt);
     if (error <= DISFLOW_ERROR_TR) break;
     compute_flow_system(dx_tmp, PATCH_SIZE, dy_tmp, PATCH_SIZE, dt, PATCH_SIZE,
                         M, b);
     solve_2x2_system(M, b, tmp_output_vec);
     *u += tmp_output_vec[0];
     *v += tmp_output_vec[1];
   }
   if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_u) > PATCH_SIZE) {
     *u = o_u;
     *v = o_v;
   }
 }

 static void fill_flow_field_borders(double *flow, int width, int height,
                                     int stride) {
   // Calculate the bounds of the rectangle which was filled in by
   // compute_flow_field() before calling this function.
   // These indices are inclusive on both ends.
   const int left_index = PATCH_CENTER;
   const int right_index = (width - PATCH_SIZE - 1) + PATCH_CENTER;
   const int top_index = PATCH_CENTER;
   const int bottom_index = (height - PATCH_SIZE - 1) + PATCH_CENTER;

   // Left area
   for (int i = top_index; i <= bottom_index; i += PATCH_STEP) {
     double *row = flow + i * stride;
     double left = row[left_index];
     for (int j = 0; j < left_index; j++) {
       row[j] = left;
     }
   }

   // Right area
   for (int i = top_index; i <= bottom_index; i += PATCH_STEP) {
     double *row = flow + i * stride;
     double right = row[right_index];
     for (int j = right_index + 1; j < width; j++) {
       row[j] = right;
     }
   }

   // Top area
   double *top_row = flow + top_index * stride;
   for (int i = 0; i < top_index; i++) {
     double *row = flow + i * stride;
     memcpy(row, top_row, width * sizeof(double));
   }

   // Bottom area
   double *bottom_row = flow + bottom_index * stride;
   for (int i = bottom_index + 1; i < height; i++) {
     double *row = flow + i * stride;
     memcpy(row, bottom_row, width * sizeof(double));
   }
 }

 // make sure flow_u and flow_v start at 0
 static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
                                double *flow_u, double *flow_v) {
   int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
   double *u_upscale =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
   double *v_upscale =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));

   assert(frm_pyr->n_levels == ref_pyr->n_levels);

   // Compute flow field from coarsest to finest level of the pyramid
 #if PATCH_STEP != 1
   // TODO(rachelbarker): This function, as written, only works if PATCH_STEP
   // == 1. For any other value, the border filling and interpolation code will
   // need to be reworked to handle the fact that there will be rows and columns
   // with no flow data
 #error "compute_flow_field() needs updating for PATCH_STEP != 1"
 #endif

   for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
     cur_width = frm_pyr->widths[level];
     cur_height = frm_pyr->heights[level];
     cur_stride = frm_pyr->strides[level];
     cur_loc = frm_pyr->level_loc[level];

     for (int i = 0; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
       for (int j = 0; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
         patch_loc = i * cur_stride + j;
         patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
         compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
                               ref_pyr->level_buffer + cur_loc, j, i, cur_width,
                               cur_height, cur_stride, flow_u + patch_center,
                               flow_v + patch_center);
       }
     }

     // Fill in the areas which we haven't explicitly computed, with copies
     // of the outermost values which we did compute
     fill_flow_field_borders(flow_u, cur_width, cur_height, cur_stride);
     fill_flow_field_borders(flow_v, cur_width, cur_height, cur_stride);

     if (level > 0) {
       int h_upscale = frm_pyr->heights[level - 1];
       int w_upscale = frm_pyr->widths[level - 1];
       int s_upscale = frm_pyr->strides[level - 1];
       av1_upscale_plane_double_prec(flow_u, cur_height, cur_width, cur_stride,
                                     u_upscale, h_upscale, w_upscale, s_upscale);
       av1_upscale_plane_double_prec(flow_v, cur_height, cur_width, cur_stride,
                                     v_upscale, h_upscale, w_upscale, s_upscale);

       // Multiply all flow vectors by 2.
       // When we move down a pyramid level, the image resolution doubles.
       // Thus we need to double all vectors in order for them to represent
       // the same translation at the next level down
       for (int i = 0; i < h_upscale; i++) {
         for (int j = 0; j < w_upscale; j++) {
           int index = i * s_upscale + j;
           flow_u[index] = u_upscale[index] * 2.0;
           flow_v[index] = v_upscale[index] * 2.0;
         }
       }
     }
   }
   aom_free(u_upscale);
   aom_free(v_upscale);
 }

 FlowField *aom_alloc_flow_field(int width, int height, int stride) {
   FlowField *flow = (FlowField *)aom_malloc(sizeof(FlowField));
   if (flow == NULL) return NULL;

   flow->width = width;
   flow->height = height;
   flow->stride = stride;

   size_t flow_size = stride * (size_t)height;
   flow->u = aom_calloc(flow_size, sizeof(double));
   flow->v = aom_calloc(flow_size, sizeof(double));

   if (flow->u == NULL || flow->v == NULL) {
     aom_free(flow->u);
     aom_free(flow->v);
     aom_free(flow);
     return NULL;
   }

   return flow;
 }

 void aom_free_flow_field(FlowField *flow) {
   aom_free(flow->u);
   aom_free(flow->v);
   aom_free(flow);
 }

 FlowField *aom_compute_flow_field(YV12_BUFFER_CONFIG *frm,
                                   YV12_BUFFER_CONFIG *ref, int bit_depth) {
   const int frm_width = frm->y_width;
   const int frm_height = frm->y_height;
   assert(frm->y_width == ref->y_width);
   assert(frm->y_height == ref->y_height);

   // Compute pyramids if necessary.
   // These are cached alongside the framebuffer to avoid unnecessary
   // recomputation. When the framebuffer is freed, or reused for a new frame,
   // these pyramids will be automatically freed.
   if (!frm->y_pyramid) {
     frm->y_pyramid = aom_compute_pyramid(frm, bit_depth, MAX_PYRAMID_LEVELS);
     assert(frm->y_pyramid);
   }
   if (!ref->y_pyramid) {
     ref->y_pyramid = aom_compute_pyramid(ref, bit_depth, MAX_PYRAMID_LEVELS);
     assert(ref->y_pyramid);
   }

   ImagePyramid *frm_pyr = frm->y_pyramid;
   ImagePyramid *ref_pyr = ref->y_pyramid;

   FlowField *flow =
       aom_alloc_flow_field(frm_width, frm_height, frm_pyr->strides[0]);

   compute_flow_field(frm_pyr, ref_pyr, flow->u, flow->v);

   return flow;
 }

 bool aom_fit_global_model_to_flow_field(FlowField *flow,
                                         TransformationType type,
                                         YV12_BUFFER_CONFIG *frm, int bit_depth,
                                         MotionModel *params_by_motion,
                                         int num_motions) {
   int num_correspondences;

   if (!frm->corners) {
     aom_find_corners_in_frame(frm, bit_depth);
   }

   // find correspondences between the two images using the flow field
   Correspondence *correspondences =
       aom_malloc(frm->num_corners * sizeof(*correspondences));
   num_correspondences = determine_disflow_correspondence(
       frm->corners, frm->num_corners, flow->u, flow->v, flow->width,
       flow->height, flow->stride, correspondences);
   ransac(correspondences, num_correspondences, type, params_by_motion,
          num_motions);

   aom_free(correspondences);
   // Set num_inliers = 0 for motions with too few inliers so they are ignored.
   for (int i = 0; i < num_motions; ++i) {
     if (params_by_motion[i].num_inliers <
         MIN_INLIER_PROB * num_correspondences) {
       params_by_motion[i].num_inliers = 0;
     }
   }

   // Return true if any one of the motions has inliers.
   for (int i = 0; i < num_motions; ++i) {
     if (params_by_motion[i].num_inliers > 0) return true;
   }
   return false;
 }

 bool aom_fit_local_model_to_flow_field(const FlowField *flow,
                                        const PixelRect *rect,
                                        TransformationType type, double *mat) {
   int width = rect_height(rect);
   int height = rect_width(rect);
   int num_points = width * height;

   // TODO(rachelbarker): Downsample if num_points is > some threshold?
   double *pts1 = aom_malloc(num_points * 2 * sizeof(double));
   double *pts2 = aom_malloc(num_points * 2 * sizeof(double));
   int index = 0;

   for (int y = rect->top; y < rect->bottom; y++) {
     for (int x = rect->left; x < rect->right; x++) {
       pts1[2 * index + 0] = (double)x;
       pts1[2 * index + 1] = (double)y;
       pts2[2 * index + 0] = (double)x + flow->u[y * flow->stride + x];
       pts2[2 * index + 1] = (double)y + flow->v[y * flow->stride + x];
       index++;
     }
   }

   // Check that we filled the expected number of points
   assert(index == num_points);

   bool result = aom_fit_motion_model(type, num_points, pts1, pts2, mat);

   aom_free(pts1);
   aom_free(pts2);
   return result;
 }
	/*
	* Copyright (c) 2022, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include "aom_dsp/aom_dsp_common.h"
	#include "aom_dsp/flow_estimation/disflow.h"
	#include "aom_dsp/flow_estimation/corner_detect.h"
	#include "aom_dsp/flow_estimation/pyramid.h"
	#include "aom_dsp/flow_estimation/ransac.h"
	#include "aom_mem/aom_mem.h"

	#include "config/av1_rtcd.h"

	// TODO(rachelbarker): Move needed code from av1/ to aom_dsp/
	#include "av1/common/resize.h"

	#include <assert.h>

	// Size of square patches in the disflow dense grid
	#define PATCH_SIZE 8
	// Center point of square patch
	#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
	// Step size between patches, lower value means greater patch overlap
	#define PATCH_STEP 1
	// Warp error convergence threshold for disflow
	#define DISFLOW_ERROR_TR 0.01
	// Max number of iterations if warp convergence is not found
	// TODO(rachelbarker): Experiment with different numbers of pyramid levels
	// and numbers of refinement steps per level
	#define DISFLOW_MAX_ITR 10

	// Don't use points around the frame border since they are less reliable
	static INLINE int valid_point(int x, int y, int width, int height) {
	return (x > (PATCH_SIZE + PATCH_CENTER)) &&
	(x < (width - PATCH_SIZE - PATCH_CENTER)) &&
	(y > (PATCH_SIZE + PATCH_CENTER)) &&
	(y < (height - PATCH_SIZE - PATCH_CENTER));
	}

	static int determine_disflow_correspondence(int *frm_corners,
	int num_frm_corners, double *flow_u,
	double *flow_v, int width,
	int height, int stride,
	Correspondence *correspondences) {
	int num_correspondences = 0;
	int x, y;
	for (int i = 0; i < num_frm_corners; ++i) {
	x = frm_corners[2 * i];
	y = frm_corners[2 * i + 1];
	if (valid_point(x, y, width, height)) {
	correspondences[num_correspondences].x = x;
	correspondences[num_correspondences].y = y;
	correspondences[num_correspondences].rx = x + flow_u[y * stride + x];
	correspondences[num_correspondences].ry = y + flow_v[y * stride + x];
	num_correspondences++;
	}
	}
	return num_correspondences;
	}

	static void getCubicKernel(double x, double *kernel) {
	assert(0 <= x && x < 1);
	double x2 = x * x;
	double x3 = x2 * x;
	kernel[0] = -0.5 * x + x2 - 0.5 * x3;
	kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
	kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
	kernel[3] = -0.5 * x2 + 0.5 * x3;
	}

	static double getCubicValue(double p, double kernel) {
	return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
	kernel[3] * p[3];
	}

	// Warps a block using flow vector [u, v] and computes the mse
	static double compute_warp_and_error(unsigned char ref, unsigned char frm,
	int width, int height, int stride, int x,
	int y, double u, double v, int16_t *dt) {
	unsigned char warped;
	int x_w, y_w;
	double mse = 0;
	int16_t err = 0;

	// Split offset into integer and fractional parts, and compute cubic
	// interpolation kernels
	int u_int = (int)floor(u);
	int v_int = (int)floor(v);
	double u_frac = u - u_int;
	double v_frac = v - v_int;

	double h_kernel[4];
	double v_kernel[4];
	getCubicKernel(u_frac, h_kernel);
	getCubicKernel(v_frac, v_kernel);

	// Storage for intermediate values between the two convolution directions
	double tmp_[PATCH_SIZE * (PATCH_SIZE + 3)];
	double *tmp = tmp_ + PATCH_SIZE; // Offset by one row

	// Clamp coordinates so that all pixels we fetch will remain within the
	// allocated border region, but allow them to go far enough out that
	// the border pixels' values do not change.
	// Since we are calculating an 8x8 block, the bottom-right pixel
	// in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
	// interpolation has 4 taps, meaning that the output of pixel
	// (x_w, y_w) depends on the pixels in the range
	// ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
	//
	// Thus the most extreme coordinates which will be fetched are
	// (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
	int x0 = clamp(x + u_int, -9, width);
	int y0 = clamp(y + v_int, -9, height);

	// Horizontal convolution
	for (int i = -1; i < PATCH_SIZE + 2; ++i) {
	y_w = y0 + i;
	for (int j = 0; j < PATCH_SIZE; ++j) {
	x_w = x0 + j;
	double arr[4];

	arr[0] = (double)ref[y_w * stride + (x_w - 1)];
	arr[1] = (double)ref[y_w * stride + (x_w + 0)];
	arr[2] = (double)ref[y_w * stride + (x_w + 1)];
	arr[3] = (double)ref[y_w * stride + (x_w + 2)];

	tmp[i * PATCH_SIZE + j] = getCubicValue(arr, h_kernel);
	}
	}

	// Vertical convolution
	for (int i = 0; i < PATCH_SIZE; ++i) {
	for (int j = 0; j < PATCH_SIZE; ++j) {
	double p = &tmp[i PATCH_SIZE + j];
	double arr[4] = { p[-PATCH_SIZE], p[0], p[PATCH_SIZE],
	p[2 * PATCH_SIZE] };
	double result = getCubicValue(arr, v_kernel);

	warped = clamp((int)(result + 0.5), 0, 255);
	err = warped - frm[(x + j) + (y + i) * stride];
	mse += err * err;
	dt[i * PATCH_SIZE + j] = err;
	}
	}

	mse /= (PATCH_SIZE * PATCH_SIZE);
	return mse;
	}

	// Computes the components of the system of equations used to solve for
	// a flow vector. This includes:
	// 1.) The hessian matrix for optical flow. This matrix is in the
	// form of:
	//
	// M = \|sum(dx * dx) sum(dx * dy)\|
	// \|sum(dx * dy) sum(dy * dy)\|
	//
	// 2.) b = \|sum(dx * dt)\|
	// \|sum(dy * dt)\|
	// Where the sums are computed over a square window of PATCH_SIZE.
	static INLINE void compute_flow_system(const double *dx, int dx_stride,
	const double *dy, int dy_stride,
	const int16_t *dt, int dt_stride,
	double M, double b) {
	for (int i = 0; i < PATCH_SIZE; i++) {
	for (int j = 0; j < PATCH_SIZE; j++) {
	M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
	M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
	M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];

	b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
	b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
	}
	}

	M[2] = M[1];
	}

	// Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix
	static INLINE void solve_2x2_system(const double M, const double b,
	double *output_vec) {
	double M_0 = M[0];
	double M_3 = M[3];
	double det = (M_0 * M_3) - (M[1] * M[2]);
	if (det < 1e-5) {
	// Handle singular matrix
	// TODO(sarahparker) compare results using pseudo inverse instead
	M_0 += 1e-10;
	M_3 += 1e-10;
	det = (M_0 * M_3) - (M[1] * M[2]);
	}
	const double det_inv = 1 / det;
	const double mult_b0 = det_inv * b[0];
	const double mult_b1 = det_inv * b[1];
	output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1;
	output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
	}

	/*
	static INLINE void image_difference(const uint8_t *src, int src_stride,
	const uint8_t *ref, int ref_stride,
	int16_t *dst, int dst_stride, int height,
	int width) {
	const int block_unit = 8;
	// Take difference in 8x8 blocks to make use of optimized diff function
	for (int i = 0; i < height; i += block_unit) {
	for (int j = 0; j < width; j += block_unit) {
	aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j,
	dst_stride, src + i * src_stride + j, src_stride,
	ref + i * ref_stride + j, ref_stride);
	}
	}
	}
	*/

	static INLINE void compute_flow_at_point(unsigned char frm, unsigned char ref,
	int x, int y, int width, int height,
	int stride, double u, double v) {
	double M[4] = { 0 };
	double b[2] = { 0 };
	double tmp_output_vec[2] = { 0 };
	double error = 0;
	int16_t dt[PATCH_SIZE * PATCH_SIZE];
	double o_u = *u;
	double o_v = *v;

	double dx_tmp[PATCH_SIZE * PATCH_SIZE];
	double dy_tmp[PATCH_SIZE * PATCH_SIZE];

	// Compute gradients within this patch
	unsigned char frm_patch = &frm[y stride + x];
	av1_convolve_2d_sobel_y_c(frm_patch, stride, dx_tmp, PATCH_SIZE, PATCH_SIZE,
	PATCH_SIZE, 1, 1.0);
	av1_convolve_2d_sobel_y_c(frm_patch, stride, dy_tmp, PATCH_SIZE, PATCH_SIZE,
	PATCH_SIZE, 0, 1.0);

	for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
	error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
	*v, dt);
	if (error <= DISFLOW_ERROR_TR) break;
	compute_flow_system(dx_tmp, PATCH_SIZE, dy_tmp, PATCH_SIZE, dt, PATCH_SIZE,
	M, b);
	solve_2x2_system(M, b, tmp_output_vec);
	*u += tmp_output_vec[0];
	*v += tmp_output_vec[1];
	}
	if (fabs(u - o_u) > PATCH_SIZE \|\| fabs(v - o_u) > PATCH_SIZE) {
	*u = o_u;
	*v = o_v;
	}
	}

	static void fill_flow_field_borders(double *flow, int width, int height,
	int stride) {
	// Calculate the bounds of the rectangle which was filled in by
	// compute_flow_field() before calling this function.
	// These indices are inclusive on both ends.
	const int left_index = PATCH_CENTER;
	const int right_index = (width - PATCH_SIZE - 1) + PATCH_CENTER;
	const int top_index = PATCH_CENTER;
	const int bottom_index = (height - PATCH_SIZE - 1) + PATCH_CENTER;

	// Left area
	for (int i = top_index; i <= bottom_index; i += PATCH_STEP) {
	double row = flow + i stride;
	double left = row[left_index];
	for (int j = 0; j < left_index; j++) {
	row[j] = left;
	}
	}

	// Right area
	for (int i = top_index; i <= bottom_index; i += PATCH_STEP) {
	double row = flow + i stride;
	double right = row[right_index];
	for (int j = right_index + 1; j < width; j++) {
	row[j] = right;
	}
	}

	// Top area
	double top_row = flow + top_index stride;
	for (int i = 0; i < top_index; i++) {
	double row = flow + i stride;
	memcpy(row, top_row, width * sizeof(double));
	}

	// Bottom area
	double bottom_row = flow + bottom_index stride;
	for (int i = bottom_index + 1; i < height; i++) {
	double row = flow + i stride;
	memcpy(row, bottom_row, width * sizeof(double));
	}
	}

	// make sure flow_u and flow_v start at 0
	static void compute_flow_field(ImagePyramid frm_pyr, ImagePyramid ref_pyr,
	double flow_u, double flow_v) {
	int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
	double *u_upscale =
	aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
	double *v_upscale =
	aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));

	assert(frm_pyr->n_levels == ref_pyr->n_levels);

	// Compute flow field from coarsest to finest level of the pyramid
	#if PATCH_STEP != 1
	// TODO(rachelbarker): This function, as written, only works if PATCH_STEP
	// == 1. For any other value, the border filling and interpolation code will
	// need to be reworked to handle the fact that there will be rows and columns
	// with no flow data
	#error "compute_flow_field() needs updating for PATCH_STEP != 1"
	#endif

	for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
	cur_width = frm_pyr->widths[level];
	cur_height = frm_pyr->heights[level];
	cur_stride = frm_pyr->strides[level];
	cur_loc = frm_pyr->level_loc[level];

	for (int i = 0; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
	for (int j = 0; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
	patch_loc = i * cur_stride + j;
	patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
	compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
	ref_pyr->level_buffer + cur_loc, j, i, cur_width,
	cur_height, cur_stride, flow_u + patch_center,
	flow_v + patch_center);
	}
	}

	// Fill in the areas which we haven't explicitly computed, with copies
	// of the outermost values which we did compute
	fill_flow_field_borders(flow_u, cur_width, cur_height, cur_stride);
	fill_flow_field_borders(flow_v, cur_width, cur_height, cur_stride);

	if (level > 0) {
	int h_upscale = frm_pyr->heights[level - 1];
	int w_upscale = frm_pyr->widths[level - 1];
	int s_upscale = frm_pyr->strides[level - 1];
	av1_upscale_plane_double_prec(flow_u, cur_height, cur_width, cur_stride,
	u_upscale, h_upscale, w_upscale, s_upscale);
	av1_upscale_plane_double_prec(flow_v, cur_height, cur_width, cur_stride,
	v_upscale, h_upscale, w_upscale, s_upscale);

	// Multiply all flow vectors by 2.
	// When we move down a pyramid level, the image resolution doubles.
	// Thus we need to double all vectors in order for them to represent
	// the same translation at the next level down
	for (int i = 0; i < h_upscale; i++) {
	for (int j = 0; j < w_upscale; j++) {
	int index = i * s_upscale + j;
	flow_u[index] = u_upscale[index] * 2.0;
	flow_v[index] = v_upscale[index] * 2.0;
	}
	}
	}
	}
	aom_free(u_upscale);
	aom_free(v_upscale);
	}

	FlowField *aom_alloc_flow_field(int width, int height, int stride) {
	FlowField flow = (FlowField )aom_malloc(sizeof(FlowField));
	if (flow == NULL) return NULL;

	flow->width = width;
	flow->height = height;
	flow->stride = stride;

	size_t flow_size = stride * (size_t)height;
	flow->u = aom_calloc(flow_size, sizeof(double));
	flow->v = aom_calloc(flow_size, sizeof(double));

	if (flow->u == NULL \|\| flow->v == NULL) {
	aom_free(flow->u);
	aom_free(flow->v);
	aom_free(flow);
	return NULL;
	}

	return flow;
	}

	void aom_free_flow_field(FlowField *flow) {
	aom_free(flow->u);
	aom_free(flow->v);
	aom_free(flow);
	}

	FlowField aom_compute_flow_field(YV12_BUFFER_CONFIG frm,
	YV12_BUFFER_CONFIG *ref, int bit_depth) {
	const int frm_width = frm->y_width;
	const int frm_height = frm->y_height;
	assert(frm->y_width == ref->y_width);
	assert(frm->y_height == ref->y_height);

	// Compute pyramids if necessary.
	// These are cached alongside the framebuffer to avoid unnecessary
	// recomputation. When the framebuffer is freed, or reused for a new frame,
	// these pyramids will be automatically freed.
	if (!frm->y_pyramid) {
	frm->y_pyramid = aom_compute_pyramid(frm, bit_depth, MAX_PYRAMID_LEVELS);
	assert(frm->y_pyramid);
	}
	if (!ref->y_pyramid) {
	ref->y_pyramid = aom_compute_pyramid(ref, bit_depth, MAX_PYRAMID_LEVELS);
	assert(ref->y_pyramid);
	}

	ImagePyramid *frm_pyr = frm->y_pyramid;
	ImagePyramid *ref_pyr = ref->y_pyramid;

	FlowField *flow =
	aom_alloc_flow_field(frm_width, frm_height, frm_pyr->strides[0]);

	compute_flow_field(frm_pyr, ref_pyr, flow->u, flow->v);

	return flow;
	}

	bool aom_fit_global_model_to_flow_field(FlowField *flow,
	TransformationType type,
	YV12_BUFFER_CONFIG *frm, int bit_depth,
	MotionModel *params_by_motion,
	int num_motions) {
	int num_correspondences;

	if (!frm->corners) {
	aom_find_corners_in_frame(frm, bit_depth);
	}

	// find correspondences between the two images using the flow field
	Correspondence *correspondences =
	aom_malloc(frm->num_corners * sizeof(*correspondences));
	num_correspondences = determine_disflow_correspondence(
	frm->corners, frm->num_corners, flow->u, flow->v, flow->width,
	flow->height, flow->stride, correspondences);
	ransac(correspondences, num_correspondences, type, params_by_motion,
	num_motions);

	aom_free(correspondences);
	// Set num_inliers = 0 for motions with too few inliers so they are ignored.
	for (int i = 0; i < num_motions; ++i) {
	if (params_by_motion[i].num_inliers <
	MIN_INLIER_PROB * num_correspondences) {
	params_by_motion[i].num_inliers = 0;
	}
	}

	// Return true if any one of the motions has inliers.
	for (int i = 0; i < num_motions; ++i) {
	if (params_by_motion[i].num_inliers > 0) return true;
	}
	return false;
	}

	bool aom_fit_local_model_to_flow_field(const FlowField *flow,
	const PixelRect *rect,
	TransformationType type, double *mat) {
	int width = rect_height(rect);
	int height = rect_width(rect);
	int num_points = width * height;

	// TODO(rachelbarker): Downsample if num_points is > some threshold?
	double pts1 = aom_malloc(num_points 2 * sizeof(double));
	double pts2 = aom_malloc(num_points 2 * sizeof(double));
	int index = 0;

	for (int y = rect->top; y < rect->bottom; y++) {
	for (int x = rect->left; x < rect->right; x++) {
	pts1[2 * index + 0] = (double)x;
	pts1[2 * index + 1] = (double)y;
	pts2[2 * index + 0] = (double)x + flow->u[y * flow->stride + x];
	pts2[2 * index + 1] = (double)y + flow->v[y * flow->stride + x];
	index++;
	}
	}

	// Check that we filled the expected number of points
	assert(index == num_points);

	bool result = aom_fit_motion_model(type, num_points, pts1, pts2, mat);

	aom_free(pts1);
	aom_free(pts2);
	return result;
	}