av1/encoder/sparse_linear_solver.c - aom - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "av1/common/av1_common_int.h"
 #include "av1/encoder/sparse_linear_solver.h"
 #include "config/aom_config.h"
 #include "aom_mem/aom_mem.h"
 #include "av1/common/alloccommon.h"

 #if CONFIG_OPTICAL_FLOW_API
 /*
  * Input:
  * rows: array of row positions
  * cols: array of column positions
  * values: array of element values
  * num_elem: total number of elements in the matrix
  * num_rows: number of rows in the matrix
  * num_cols: number of columns in the matrix
  *
  * Output:
  * sm: pointer to the sparse matrix to be initialized
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
                         int num_elem, int num_rows, int num_cols,
                         SPARSE_MTX *sm) {
   sm->n_elem = num_elem;
   sm->n_rows = num_rows;
   sm->n_cols = num_cols;
   if (num_elem == 0) {
     sm->row_pos = NULL;
     sm->col_pos = NULL;
     sm->value = NULL;
     return 0;
   }
   sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
   sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
   sm->value = aom_calloc(num_elem, sizeof(*sm->value));

   if (!sm->row_pos || !sm->col_pos || !sm->value) {
     av1_free_sparse_mtx_elems(sm);
     return -1;
   }

   memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
   memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
   memcpy(sm->value, values, num_elem * sizeof(*sm->value));

   return 0;
 }

 /*
  * Combines two sparse matrices (allocating new space).
  *
  * Input:
  * sm1, sm2: matrices to be combined
  * row_offset1, row_offset2: row offset of each matrix in the new matrix
  * col_offset1, col_offset2: column offset of each matrix in the new matrix
  * new_n_rows, new_n_cols: number of rows and columns in the new matrix
  *
  * Output:
  * sm: the combined matrix
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
                                 SPARSE_MTX *sm, int row_offset1,
                                 int col_offset1, int row_offset2,
                                 int col_offset2, int new_n_rows,
                                 int new_n_cols) {
   sm->n_elem = sm1->n_elem + sm2->n_elem;
   sm->n_cols = new_n_cols;
   sm->n_rows = new_n_rows;

   if (sm->n_elem == 0) {
     sm->row_pos = NULL;
     sm->col_pos = NULL;
     sm->value = NULL;
     return 0;
   }

   sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
   sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
   sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));

   if (!sm->row_pos || !sm->col_pos || !sm->value) {
     av1_free_sparse_mtx_elems(sm);
     return -1;
   }

   for (int i = 0; i < sm1->n_elem; i++) {
     sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
     sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
   }
   memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
   int n_elem1 = sm1->n_elem;
   for (int i = 0; i < sm2->n_elem; i++) {
     sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
     sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
   }
   memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
   return 0;
 }

 void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
   sm->n_cols = 0;
   sm->n_rows = 0;
   if (sm->n_elem != 0) {
     aom_free(sm->row_pos);
     aom_free(sm->col_pos);
     aom_free(sm->value);
   }
   sm->n_elem = 0;
 }

 /*
  * Calculate matrix and vector multiplication: A*b
  *
  * Input:
  * sm: matrix A
  * srcv: the vector b to be multiplied to
  * dstl: the length of vectors
  *
  * Output:
  * dstv: pointer to the resulting vector
  */
 void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
                               double *dstv, int dstl) {
   memset(dstv, 0, sizeof(*dstv) * dstl);
   for (int i = 0; i < sm->n_elem; i++) {
     dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
   }
 }
 /*
  * Calculate matrix and vector multiplication: b*A
  *
  * Input:
  * sm: matrix A
  * srcv: the vector b to be multiplied to
  * dstl: the length of vectors
  *
  * Output:
  * dstv: pointer to the resulting vector
  */
 void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
                              double *dstv, int dstl) {
   memset(dstv, 0, sizeof(*dstv) * dstl);
   for (int i = 0; i < sm->n_elem; i++) {
     dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
   }
 }

 /*
  * Calculate inner product of two vectors
  *
  * Input:
  * src1, scr2: the vectors to be multiplied
  * src1l: length of the vectors
  *
  * Output:
  * the inner product
  */
 double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
   double result = 0;
   for (int i = 0; i < src1l; i++) {
     result += src1[i] * src2[i];
   }
   return result;
 }

 /*
  * Multiply each element in the matrix sm with a constant c
  */
 void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
   for (int i = 0; i < sm->n_elem; i++) {
     sm->value[i] *= c;
   }
 }

 static INLINE void free_solver_local_buf(double *buf1, double *buf2,
                                          double *buf3, double *buf4,
                                          double *buf5, double *buf6,
                                          double *buf7) {
   aom_free(buf1);
   aom_free(buf2);
   aom_free(buf3);
   aom_free(buf4);
   aom_free(buf5);
   aom_free(buf6);
   aom_free(buf7);
 }

 /*
  * Solve for Ax = b
  * no requirement on A
  *
  * Input:
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
  * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
                                      int bl, double *x) {
   double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
          *p_hatA = NULL, *x_hat = NULL;
   double alpha, beta, rtr, r_norm_2;
   double denormtemp;

   // initialize
   r = aom_calloc(bl, sizeof(*r));
   r_hat = aom_calloc(bl, sizeof(*r_hat));
   p = aom_calloc(bl, sizeof(*p));
   p_hat = aom_calloc(bl, sizeof(*p_hat));
   Ap = aom_calloc(bl, sizeof(*Ap));
   p_hatA = aom_calloc(bl, sizeof(*p_hatA));
   x_hat = aom_calloc(bl, sizeof(*x_hat));
   if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
     free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
     return -1;
   }

   int i;
   for (i = 0; i < bl; i++) {
     r[i] = b[i];
     r_hat[i] = b[i];
     p[i] = r[i];
     p_hat[i] = r_hat[i];
     x[i] = 0;
     x_hat[i] = 0;
   }
   r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
   for (int k = 0; k < MAX_CG_SP_ITER; k++) {
     rtr = r_norm_2;
     av1_mtx_vect_multi_right(A, p, Ap, bl);
     av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);

     denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
     if (denormtemp < 1e-10) break;
     alpha = rtr / denormtemp;
     r_norm_2 = 0;
     for (i = 0; i < bl; i++) {
       x[i] += alpha * p[i];
       x_hat[i] += alpha * p_hat[i];
       r[i] -= alpha * Ap[i];
       r_hat[i] -= alpha * p_hatA[i];
       r_norm_2 += r_hat[i] * r[i];
     }
     if (sqrt(r_norm_2) < 1e-2) {
       break;
     }
     if (rtr < 1e-10) break;
     beta = r_norm_2 / rtr;
     for (i = 0; i < bl; i++) {
       p[i] = r[i] + beta * p[i];
       p_hat[i] = r_hat[i] + beta * p_hat[i];
     }
   }
   // free
   free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
   return 0;
 }

 /*
  * Solve for Ax = b when A is symmetric and positive definite
  *
  * Input:
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
  * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
                                   double *x) {
   double *r = NULL, *p = NULL, *Ap = NULL;
   double alpha, beta, rtr, r_norm_2;
   double denormtemp;

   // initialize
   r = aom_calloc(bl, sizeof(*r));
   p = aom_calloc(bl, sizeof(*p));
   Ap = aom_calloc(bl, sizeof(*Ap));
   if (!r || !p || !Ap) {
     free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
     return -1;
   }

   int i;
   for (i = 0; i < bl; i++) {
     r[i] = b[i];
     p[i] = r[i];
     x[i] = 0;
   }
   r_norm_2 = av1_vect_vect_multi(r, bl, r);
   int k;
   for (k = 0; k < MAX_CG_SP_ITER; k++) {
     rtr = r_norm_2;
     av1_mtx_vect_multi_right(A, p, Ap, bl);
     denormtemp = av1_vect_vect_multi(p, bl, Ap);
     if (denormtemp < 1e-10) break;
     alpha = rtr / denormtemp;
     r_norm_2 = 0;
     for (i = 0; i < bl; i++) {
       x[i] += alpha * p[i];
       r[i] -= alpha * Ap[i];
       r_norm_2 += r[i] * r[i];
     }
     if (r_norm_2 < 1e-8 * bl) break;
     if (rtr < 1e-10) break;
     beta = r_norm_2 / rtr;
     for (i = 0; i < bl; i++) {
       p[i] = r[i] + beta * p[i];
     }
   }
   // free
   free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);

   return 0;
 }

 /*
  * Solve for Ax = b using Jacobi method
  *
  * Input:
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
  * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
   double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
          *tempx = NULL;
   double resi2;

   diags = aom_calloc(bl, sizeof(*diags));
   Rx = aom_calloc(bl, sizeof(*Rx));
   x_last = aom_calloc(bl, sizeof(*x_last));
   x_cur = aom_calloc(bl, sizeof(*x_cur));

   if (!diags || !Rx || !x_last || !x_cur) {
     free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
     return -1;
   }

   int i;
   memset(x_last, 0, sizeof(*x_last) * bl);
   // get the diagonals of A
   memset(diags, 0, sizeof(*diags) * bl);
   for (int c = 0; c < A->n_elem; c++) {
     if (A->row_pos[c] != A->col_pos[c]) continue;
     diags[A->row_pos[c]] = A->value[c];
   }
   int k;
   for (k = 0; k < MAX_CG_SP_ITER; k++) {
     // R = A - diag(diags)
     // get R*x_last
     memset(Rx, 0, sizeof(*Rx) * bl);
     for (int c = 0; c < A->n_elem; c++) {
       if (A->row_pos[c] == A->col_pos[c]) continue;
       Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
     }
     resi2 = 0;
     for (i = 0; i < bl; i++) {
       x_cur[i] = (b[i] - Rx[i]) / diags[i];
       resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
     }
     if (resi2 <= 1e-10 * bl) break;
     // swap last & cur buffer ptrs
     tempx = x_last;
     x_last = x_cur;
     x_cur = tempx;
   }
   printf("\n numiter: %d\n", k);
   for (i = 0; i < bl; i++) {
     x[i] = x_cur[i];
   }
   free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
   return 0;
 }

 /*
  * Solve for Ax = b using Steepest descent method
  *
  * Input:
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
  * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
  *
  * Return: 0  - success
  *         -1 - failed
  */
 int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
                                 double *x) {
   double *d = NULL, *Ad = NULL, *Ax = NULL;
   double resi2, resi2_last, dAd, temp;

   d = aom_calloc(bl, sizeof(*d));
   Ax = aom_calloc(bl, sizeof(*Ax));
   Ad = aom_calloc(bl, sizeof(*Ad));

   if (!d || !Ax || !Ad) {
     free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
     return -1;
   }

   int i;
   // initialize with 0s
   resi2 = 0;
   for (i = 0; i < bl; i++) {
     x[i] = 0;
     d[i] = b[i];
     resi2 += d[i] * d[i] / bl;
   }
   int k;
   for (k = 0; k < MAX_CG_SP_ITER; k++) {
     // get A*x_last
     av1_mtx_vect_multi_right(A, d, Ad, bl);
     dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
     for (i = 0; i < bl; i++) {
       temp = dAd * d[i];
       x[i] = x[i] + temp;
     }
     av1_mtx_vect_multi_right(A, x, Ax, bl);
     resi2_last = resi2;
     resi2 = 0;
     for (i = 0; i < bl; i++) {
       d[i] = b[i] - Ax[i];
       resi2 += d[i] * d[i] / bl;
     }
     if (resi2 <= 1e-8) break;
     if (resi2_last - resi2 < 1e-8) {
       break;
     }
   }
   free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);

   return 0;
 }

 #endif  // CONFIG_OPTICAL_FLOW_API
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/
	#include "av1/common/av1_common_int.h"
	#include "av1/encoder/sparse_linear_solver.h"
	#include "config/aom_config.h"
	#include "aom_mem/aom_mem.h"
	#include "av1/common/alloccommon.h"

	#if CONFIG_OPTICAL_FLOW_API
	/*
	* Input:
	* rows: array of row positions
	* cols: array of column positions
	* values: array of element values
	* num_elem: total number of elements in the matrix
	* num_rows: number of rows in the matrix
	* num_cols: number of columns in the matrix
	*
	* Output:
	* sm: pointer to the sparse matrix to be initialized
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_init_sparse_mtx(const int rows, const int cols, const double *values,
	int num_elem, int num_rows, int num_cols,
	SPARSE_MTX *sm) {
	sm->n_elem = num_elem;
	sm->n_rows = num_rows;
	sm->n_cols = num_cols;
	if (num_elem == 0) {
	sm->row_pos = NULL;
	sm->col_pos = NULL;
	sm->value = NULL;
	return 0;
	}
	sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
	sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
	sm->value = aom_calloc(num_elem, sizeof(*sm->value));

	if (!sm->row_pos \|\| !sm->col_pos \|\| !sm->value) {
	av1_free_sparse_mtx_elems(sm);
	return -1;
	}

	memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
	memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
	memcpy(sm->value, values, num_elem * sizeof(*sm->value));

	return 0;
	}

	/*
	* Combines two sparse matrices (allocating new space).
	*
	* Input:
	* sm1, sm2: matrices to be combined
	* row_offset1, row_offset2: row offset of each matrix in the new matrix
	* col_offset1, col_offset2: column offset of each matrix in the new matrix
	* new_n_rows, new_n_cols: number of rows and columns in the new matrix
	*
	* Output:
	* sm: the combined matrix
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_init_combine_sparse_mtx(const SPARSE_MTX sm1, const SPARSE_MTX sm2,
	SPARSE_MTX *sm, int row_offset1,
	int col_offset1, int row_offset2,
	int col_offset2, int new_n_rows,
	int new_n_cols) {
	sm->n_elem = sm1->n_elem + sm2->n_elem;
	sm->n_cols = new_n_cols;
	sm->n_rows = new_n_rows;

	if (sm->n_elem == 0) {
	sm->row_pos = NULL;
	sm->col_pos = NULL;
	sm->value = NULL;
	return 0;
	}

	sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
	sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
	sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));

	if (!sm->row_pos \|\| !sm->col_pos \|\| !sm->value) {
	av1_free_sparse_mtx_elems(sm);
	return -1;
	}

	for (int i = 0; i < sm1->n_elem; i++) {
	sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
	sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
	}
	memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
	int n_elem1 = sm1->n_elem;
	for (int i = 0; i < sm2->n_elem; i++) {
	sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
	sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
	}
	memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
	return 0;
	}

	void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
	sm->n_cols = 0;
	sm->n_rows = 0;
	if (sm->n_elem != 0) {
	aom_free(sm->row_pos);
	aom_free(sm->col_pos);
	aom_free(sm->value);
	}
	sm->n_elem = 0;
	}

	/*
	* Calculate matrix and vector multiplication: A*b
	*
	* Input:
	* sm: matrix A
	* srcv: the vector b to be multiplied to
	* dstl: the length of vectors
	*
	* Output:
	* dstv: pointer to the resulting vector
	*/
	void av1_mtx_vect_multi_right(const SPARSE_MTX sm, const double srcv,
	double *dstv, int dstl) {
	memset(dstv, 0, sizeof(dstv) dstl);
	for (int i = 0; i < sm->n_elem; i++) {
	dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
	}
	}
	/*
	* Calculate matrix and vector multiplication: b*A
	*
	* Input:
	* sm: matrix A
	* srcv: the vector b to be multiplied to
	* dstl: the length of vectors
	*
	* Output:
	* dstv: pointer to the resulting vector
	*/
	void av1_mtx_vect_multi_left(const SPARSE_MTX sm, const double srcv,
	double *dstv, int dstl) {
	memset(dstv, 0, sizeof(dstv) dstl);
	for (int i = 0; i < sm->n_elem; i++) {
	dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
	}
	}

	/*
	* Calculate inner product of two vectors
	*
	* Input:
	* src1, scr2: the vectors to be multiplied
	* src1l: length of the vectors
	*
	* Output:
	* the inner product
	*/
	double av1_vect_vect_multi(const double src1, int src1l, const double src2) {
	double result = 0;
	for (int i = 0; i < src1l; i++) {
	result += src1[i] * src2[i];
	}
	return result;
	}

	/*
	* Multiply each element in the matrix sm with a constant c
	*/
	void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
	for (int i = 0; i < sm->n_elem; i++) {
	sm->value[i] *= c;
	}
	}

	static INLINE void free_solver_local_buf(double buf1, double buf2,
	double buf3, double buf4,
	double buf5, double buf6,
	double *buf7) {
	aom_free(buf1);
	aom_free(buf2);
	aom_free(buf3);
	aom_free(buf4);
	aom_free(buf5);
	aom_free(buf6);
	aom_free(buf7);
	}

	/*
	* Solve for Ax = b
	* no requirement on A
	*
	* Input:
	* A: the sparse matrix
	* b: the vector b
	* bl: length of b
	* x: the vector x
	*
	* Output:
	* x: pointer to the solution vector
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX A, const double b,
	int bl, double *x) {
	double r = NULL, r_hat = NULL, p = NULL, p_hat = NULL, *Ap = NULL,
	p_hatA = NULL, x_hat = NULL;
	double alpha, beta, rtr, r_norm_2;
	double denormtemp;

	// initialize
	r = aom_calloc(bl, sizeof(*r));
	r_hat = aom_calloc(bl, sizeof(*r_hat));
	p = aom_calloc(bl, sizeof(*p));
	p_hat = aom_calloc(bl, sizeof(*p_hat));
	Ap = aom_calloc(bl, sizeof(*Ap));
	p_hatA = aom_calloc(bl, sizeof(*p_hatA));
	x_hat = aom_calloc(bl, sizeof(*x_hat));
	if (!r \|\| !r_hat \|\| !p \|\| !p_hat \|\| !Ap \|\| !p_hatA \|\| !x_hat) {
	free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
	return -1;
	}

	int i;
	for (i = 0; i < bl; i++) {
	r[i] = b[i];
	r_hat[i] = b[i];
	p[i] = r[i];
	p_hat[i] = r_hat[i];
	x[i] = 0;
	x_hat[i] = 0;
	}
	r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
	for (int k = 0; k < MAX_CG_SP_ITER; k++) {
	rtr = r_norm_2;
	av1_mtx_vect_multi_right(A, p, Ap, bl);
	av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);

	denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
	if (denormtemp < 1e-10) break;
	alpha = rtr / denormtemp;
	r_norm_2 = 0;
	for (i = 0; i < bl; i++) {
	x[i] += alpha * p[i];
	x_hat[i] += alpha * p_hat[i];
	r[i] -= alpha * Ap[i];
	r_hat[i] -= alpha * p_hatA[i];
	r_norm_2 += r_hat[i] * r[i];
	}
	if (sqrt(r_norm_2) < 1e-2) {
	break;
	}
	if (rtr < 1e-10) break;
	beta = r_norm_2 / rtr;
	for (i = 0; i < bl; i++) {
	p[i] = r[i] + beta * p[i];
	p_hat[i] = r_hat[i] + beta * p_hat[i];
	}
	}
	// free
	free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
	return 0;
	}

	/*
	* Solve for Ax = b when A is symmetric and positive definite
	*
	* Input:
	* A: the sparse matrix
	* b: the vector b
	* bl: length of b
	* x: the vector x
	*
	* Output:
	* x: pointer to the solution vector
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_conjugate_gradient_sparse(const SPARSE_MTX A, const double b, int bl,
	double *x) {
	double r = NULL, p = NULL, *Ap = NULL;
	double alpha, beta, rtr, r_norm_2;
	double denormtemp;

	// initialize
	r = aom_calloc(bl, sizeof(*r));
	p = aom_calloc(bl, sizeof(*p));
	Ap = aom_calloc(bl, sizeof(*Ap));
	if (!r \|\| !p \|\| !Ap) {
	free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
	return -1;
	}

	int i;
	for (i = 0; i < bl; i++) {
	r[i] = b[i];
	p[i] = r[i];
	x[i] = 0;
	}
	r_norm_2 = av1_vect_vect_multi(r, bl, r);
	int k;
	for (k = 0; k < MAX_CG_SP_ITER; k++) {
	rtr = r_norm_2;
	av1_mtx_vect_multi_right(A, p, Ap, bl);
	denormtemp = av1_vect_vect_multi(p, bl, Ap);
	if (denormtemp < 1e-10) break;
	alpha = rtr / denormtemp;
	r_norm_2 = 0;
	for (i = 0; i < bl; i++) {
	x[i] += alpha * p[i];
	r[i] -= alpha * Ap[i];
	r_norm_2 += r[i] * r[i];
	}
	if (r_norm_2 < 1e-8 * bl) break;
	if (rtr < 1e-10) break;
	beta = r_norm_2 / rtr;
	for (i = 0; i < bl; i++) {
	p[i] = r[i] + beta * p[i];
	}
	}
	// free
	free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);

	return 0;
	}

	/*
	* Solve for Ax = b using Jacobi method
	*
	* Input:
	* A: the sparse matrix
	* b: the vector b
	* bl: length of b
	* x: the vector x
	*
	* Output:
	* x: pointer to the solution vector
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_jacobi_sparse(const SPARSE_MTX A, const double b, int bl, double *x) {
	double diags = NULL, Rx = NULL, x_last = NULL, x_cur = NULL,
	*tempx = NULL;
	double resi2;

	diags = aom_calloc(bl, sizeof(*diags));
	Rx = aom_calloc(bl, sizeof(*Rx));
	x_last = aom_calloc(bl, sizeof(*x_last));
	x_cur = aom_calloc(bl, sizeof(*x_cur));

	if (!diags \|\| !Rx \|\| !x_last \|\| !x_cur) {
	free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
	return -1;
	}

	int i;
	memset(x_last, 0, sizeof(x_last) bl);
	// get the diagonals of A
	memset(diags, 0, sizeof(diags) bl);
	for (int c = 0; c < A->n_elem; c++) {
	if (A->row_pos[c] != A->col_pos[c]) continue;
	diags[A->row_pos[c]] = A->value[c];
	}
	int k;
	for (k = 0; k < MAX_CG_SP_ITER; k++) {
	// R = A - diag(diags)
	// get R*x_last
	memset(Rx, 0, sizeof(Rx) bl);
	for (int c = 0; c < A->n_elem; c++) {
	if (A->row_pos[c] == A->col_pos[c]) continue;
	Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
	}
	resi2 = 0;
	for (i = 0; i < bl; i++) {
	x_cur[i] = (b[i] - Rx[i]) / diags[i];
	resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
	}
	if (resi2 <= 1e-10 * bl) break;
	// swap last & cur buffer ptrs
	tempx = x_last;
	x_last = x_cur;
	x_cur = tempx;
	}
	printf("\n numiter: %d\n", k);
	for (i = 0; i < bl; i++) {
	x[i] = x_cur[i];
	}
	free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
	return 0;
	}

	/*
	* Solve for Ax = b using Steepest descent method
	*
	* Input:
	* A: the sparse matrix
	* b: the vector b
	* bl: length of b
	* x: the vector x
	*
	* Output:
	* x: pointer to the solution vector
	*
	* Return: 0 - success
	* -1 - failed
	*/
	int av1_steepest_descent_sparse(const SPARSE_MTX A, const double b, int bl,
	double *x) {
	double d = NULL, Ad = NULL, *Ax = NULL;
	double resi2, resi2_last, dAd, temp;

	d = aom_calloc(bl, sizeof(*d));
	Ax = aom_calloc(bl, sizeof(*Ax));
	Ad = aom_calloc(bl, sizeof(*Ad));

	if (!d \|\| !Ax \|\| !Ad) {
	free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
	return -1;
	}

	int i;
	// initialize with 0s
	resi2 = 0;
	for (i = 0; i < bl; i++) {
	x[i] = 0;
	d[i] = b[i];
	resi2 += d[i] * d[i] / bl;
	}
	int k;
	for (k = 0; k < MAX_CG_SP_ITER; k++) {
	// get A*x_last
	av1_mtx_vect_multi_right(A, d, Ad, bl);
	dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
	for (i = 0; i < bl; i++) {
	temp = dAd * d[i];
	x[i] = x[i] + temp;
	}
	av1_mtx_vect_multi_right(A, x, Ax, bl);
	resi2_last = resi2;
	resi2 = 0;
	for (i = 0; i < bl; i++) {
	d[i] = b[i] - Ax[i];
	resi2 += d[i] * d[i] / bl;
	}
	if (resi2 <= 1e-8) break;
	if (resi2_last - resi2 < 1e-8) {
	break;
	}
	}
	free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);

	return 0;
	}

	#endif // CONFIG_OPTICAL_FLOW_API