av1/common/warped_motion.h - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
 #define AOM_AV1_COMMON_WARPED_MOTION_H_

 #include <stdio.h>
 #include <stdlib.h>
 #include <memory.h>
 #include <math.h>
 #include <assert.h>
 #include <stdbool.h>

 #include "config/aom_config.h"

 #include "aom_ports/mem.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/common/mv.h"
 #include "av1/common/convolve.h"
 #include "av1/common/blockd.h"

 #define LEAST_SQUARES_SAMPLES_MAX_BITS 3
 #define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
 #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
 #define WARPED_MOTION_DEBUG 0
 #define DEFAULT_WMTYPE AFFINE
 #define WARP_ERROR_BLOCK_LOG 5
 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)

 #if CONFIG_EXT_WARP_FILTER
 #define EXT_WARP_TAPS 6
 #define EXT_WARP_PHASES_LOG2 6
 #define EXT_WARP_PHASES (1 << EXT_WARP_PHASES_LOG2)
 #define EXT_WARP_ROUND_BITS (WARPEDMODEL_PREC_BITS - EXT_WARP_PHASES_LOG2)

 // The extended warp filter is a 6-tap filter, but we store each kernel with
 // two extra zeros at the end so that each kernel is 16-byte aligned
 #define EXT_WARP_STORAGE_TAPS 8
 #endif  // CONFIG_EXT_WARP_FILTER

 #define DIV_LUT_PREC_BITS 14
 #define DIV_LUT_BITS 8
 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)

 extern const uint16_t div_lut[DIV_LUT_NUM + 1];

 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
 // at precision of DIV_LUT_PREC_BITS along with the shift.
 static INLINE int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
   int64_t f;
   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                                : get_msb((unsigned int)D));
   // e is obtained from D after resetting the most significant 1 bit.
   const int64_t e = D - ((uint64_t)1 << *shift);
   // Get the most significant DIV_LUT_BITS (8) bits of e into f
   if (*shift > DIV_LUT_BITS)
     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
   else
     f = e << (DIV_LUT_BITS - *shift);
   assert(f <= DIV_LUT_NUM);
   *shift += DIV_LUT_PREC_BITS;
   // Use f as lookup into the precomputed table of multipliers
   return div_lut[f];
 }

 static INLINE int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
   int32_t f;
   *shift = get_msb(D);
   // e is obtained from D after resetting the most significant 1 bit.
   const int32_t e = D - ((uint32_t)1 << *shift);
   // Get the most significant DIV_LUT_BITS (8) bits of e into f
   if (*shift > DIV_LUT_BITS)
     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
   else
     f = e << (DIV_LUT_BITS - *shift);
   assert(f <= DIV_LUT_NUM);
   *shift += DIV_LUT_PREC_BITS;
   // Use f as lookup into the precomputed table of multipliers
   return div_lut[f];
 }

 static INLINE int16_t resolve_divisor_32_CfL(int32_t N, int32_t D,
                                              int16_t shift) {
   int32_t f_n, f_d;
   int ret;

   assert(D >= 0);
   int sign_N = N >= 0 ? 0 : 1;

   if (sign_N) N = -N;

   if (N == 0 || D == 0)
     return 0;
   else {
     int16_t shift_n = get_msb(N);
     int16_t shift_d = get_msb(D);
     // e is obtained from D after resetting the most significant 1 bit.
     const int32_t e_d = D - ((uint32_t)1 << shift_d);
     // Get the most significant DIV_LUT_BITS (8) bits of e into f
     if (shift_d > DIV_LUT_BITS)
       f_d = ROUND_POWER_OF_TWO(e_d, shift_d - DIV_LUT_BITS);
     else
       f_d = e_d << (DIV_LUT_BITS - shift_d);
     assert(f_d <= DIV_LUT_NUM);

     if (shift_n > DIV_LUT_BITS)
       f_n = ROUND_POWER_OF_TWO(N, shift_n - DIV_LUT_BITS);
     else
       f_n = N << (DIV_LUT_BITS - shift_n);

     assert(f_d <= DIV_LUT_NUM);
     assert(f_n <= DIV_LUT_NUM * 2);

     const int shift_add = shift_d - shift_n - shift;

     // The maximum value of `div_lut[f_d] * f_n` is
     // `1 << (DIV_LUT_PREC_BITS + DIV_LUT_BITS + 1)`
     // Hence `shift_add`below is constrained to be <= 1.
     if (shift_add <= 1) {
       ret = (div_lut[f_d] * f_n) >>
             (DIV_LUT_PREC_BITS + DIV_LUT_BITS + shift_add);
     } else {
       ret = 0;
     }
     if (ret >= (2 << shift) - 1) ret = (2 << shift) - 1;

     if (sign_N) ret = -ret;
     return ret;
   }
 }

 #if CONFIG_RELAX_AFFINE_CONSTRAINTS
 extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 7 + 1][8];
 #else
 extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
 #endif  // CONFIG_RELAX_AFFINE_CONSTRAINTS

 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);

 #if CONFIG_EXT_WARP_FILTER
 extern const int16_t av1_ext_warped_filter[EXT_WARP_PHASES + 1]
                                           [EXT_WARP_STORAGE_TAPS];
 #endif  // CONFIG_EXT_WARP_FILTER

 static const uint8_t warp_pad_left[14][16] = {
   { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
   { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
   { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
   { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
   { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
   { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 },
 };

 static const uint8_t warp_pad_right[14][16] = {
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
   { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
   { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
   { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
   { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
   { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
   { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
   { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 };

 // Recompute the translational part of a warp model, so that the center
 // of the current block (determined by `mi_row`, `mi_col`, `bsize`)
 // has an induced motion vector of `mv`
 void av1_set_warp_translation(int mi_row, int mi_col, BLOCK_SIZE bsize, MV mv,
                               WarpedMotionParams *wm);

 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                        int width, int height, int stride, uint16_t *const pred,
                        int p_col, int p_row, int p_width, int p_height,
                        int p_stride, int subsampling_x, int subsampling_y,
                        int bd, ConvolveParams *conv_params
 #if CONFIG_ACROSS_SCALE_WARP
                        ,
                        const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 #if CONFIG_OPFL_MEMBW_REDUCTION
                        ,
                        int use_damr_padding, ReferenceArea *ref_area
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
 #if CONFIG_WARP_BD_BOX
                        ,
                        int use_warp_bd_box, WarpBoundaryBox *warp_bd_box,
                        int use_warp_bd_damr, WarpBoundaryBox *warp_bd_box_damr
 #endif  // CONFIG_WARP_BD_BOX
 );

 void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                 int height, int stride, uint8_t *pred, int p_col, int p_row,
                 int p_width, int p_height, int p_stride, int subsampling_x,
                 int subsampling_y, ConvolveParams *conv_params);
 #if CONFIG_AFFINE_REFINEMENT && CONFIG_EXT_WARP_FILTER
 void av1_warp_plane_ext(WarpedMotionParams *wm, int bd, const uint16_t *ref,
                         int width, int height, int stride, uint16_t *pred,
                         int p_col, int p_row, int p_width, int p_height,
                         int p_stride, int subsampling_x, int subsampling_y,
                         ConvolveParams *conv_params);
 #endif  // CONFIG_AFFINE_REFINEMENT && CONFIG_EXT_WARP_FILTER
 void av1_warp_plane(WarpedMotionParams *wm, int bd, const uint16_t *ref,
                     int width, int height, int stride, uint16_t *pred,
                     int p_col, int p_row, int p_width, int p_height,
                     int p_stride, int subsampling_x, int subsampling_y,
                     ConvolveParams *conv_params
 #if CONFIG_ACROSS_SCALE_WARP
                     ,
                     const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 #if CONFIG_OPFL_MEMBW_REDUCTION
                     ,
                     int use_damr_padding, ReferenceArea *ref_area
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
 #if CONFIG_WARP_BD_BOX
                     ,
                     int use_warp_bd_box, WarpBoundaryBox *warp_bd_box,
                     int use_warp_bd_damr, WarpBoundaryBox *warp_bd_box_damr
 #endif  // CONFIG_WARP_BD_BOX
 );

 int av1_find_projection(int np, const int *pts1, const int *pts2,
                         BLOCK_SIZE bsize, MV mv, WarpedMotionParams *wm_params,
                         int mi_row, int mi_col
 #if CONFIG_ACROSS_SCALE_WARP
                         ,
                         const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 );

 int av1_get_shear_params(WarpedMotionParams *wm
 #if CONFIG_ACROSS_SCALE_WARP
                          ,
                          const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 );

 // Reduce the precision of a warp model, ready for use in the warp filter
 // and for storage. This should be called after the non-translational parameters
 // are calculated, but before av1_set_warp_translation() or
 // av1_get_shear_params() are called
 void av1_reduce_warp_model(WarpedMotionParams *wm);

 #if CONFIG_EXT_WARP_FILTER
 // Check if a model is already properly reduced, according to the same logic
 // used in av1_reduce_warp_model()
 bool av1_is_warp_model_reduced(WarpedMotionParams *wm);
 #endif  // CONFIG_EXT_WARP_FILTER

 int av1_extend_warp_model(const bool neighbor_is_above, const BLOCK_SIZE bsize,
                           const MV *center_mv, const int mi_row,
                           const int mi_col,
                           const WarpedMotionParams *neighbor_wm,
                           WarpedMotionParams *wm_params
 #if CONFIG_ACROSS_SCALE_WARP
                           ,
                           const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 );

 #if CONFIG_IMPROVED_GLOBAL_MOTION
 // Given a warp model which was initially used at a temporal distance of
 // `in_distance`, rescale it to a new temporal distance of `out_distance`.
 // Both distances are allowed to be negative, but they must be nonzero.
 //
 // The mathematically ideal way to rescale a warp model from one temporal
 // distance to another would be to use a matrix exponential: If we write the
 // input model as a 3x3 matrix M, then the output model should be
 //
 //  ideal output = M ^ (out_distance / in_distance)
 //
 // However, computing a matrix exponential is complicated, especially in
 // fixed point, and so would not be very hardware friendly. In addition,
 // this function is mainly used to predict global motion parameters, with
 // the true values being coded as a delta from this prediction. As the
 // global motion will not be perfectly consistent, there's a limit to how
 // accurate our prediction can be.
 //
 // For these reasons, we approximate the matrix exponential using its
 // first-order Taylor series:
 //
 //  output = I + (M - I) * (out_distance / in_distance)
 //
 // This is far easier to compute, and provides a "good enough" approximation
 // for the models we use in practice, which are all reasonably near to the
 // identity model (all parameters except for the translational part are
 // within +/- 1/2 of the identity).
 static INLINE void av1_scale_warp_model(const WarpedMotionParams *in_params,
                                         int in_distance,
                                         WarpedMotionParams *out_params,
                                         int out_distance) {
   static int param_shift[MAX_PARAMDIM] = {
     GM_TRANS_PREC_DIFF, GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF,
     GM_ALPHA_PREC_DIFF, GM_ALPHA_PREC_DIFF, GM_ALPHA_PREC_DIFF
   };

   static int param_min[MAX_PARAMDIM] = { GM_TRANS_MIN, GM_TRANS_MIN,
                                          GM_ALPHA_MIN, GM_ALPHA_MIN,
                                          GM_ALPHA_MIN, GM_ALPHA_MIN };

   static int param_max[MAX_PARAMDIM] = { GM_TRANS_MAX, GM_TRANS_MAX,
                                          GM_ALPHA_MAX, GM_ALPHA_MAX,
                                          GM_ALPHA_MAX, GM_ALPHA_MAX };

   // If in_distance == 0, then we can't meaningfully scale the model because
   // this would correspond to a division by 0. So we fall back to using the
   // identity model as a reference.
   //
   // Note: Global motion is disabled for temporal distances of 0, so in this
   // situation the input model must be the identity model anyway. Check this
   // constraint here to help keep things internally consistent.
   if (in_distance == 0) {
     *out_params = default_warp_params;
     return;
   }

   // If out_distance == 0, then global motion is disabled, so we shouldn't
   // get to this function
   assert(out_distance != 0);

   // Flip signs so that in_distance is positive.
   // We do this because
   //   scaled_value = (... + divisor/2) / divisor
   // is the simplest way to implement division with round-to-nearest in C,
   // but it only works correctly if the divisor is positive
   if (in_distance < 0) {
     in_distance = -in_distance;
     out_distance = -out_distance;
   }

   out_params->wmtype = in_params->wmtype;
   for (int param = 0; param < MAX_PARAMDIM; param++) {
     int center = default_warp_params.wmmat[param];

     int input = in_params->wmmat[param] - center;
     int divisor = in_distance * (1 << param_shift[param]);
     int output = (int)(((int64_t)input * out_distance + divisor / 2) / divisor);
     output = clamp(output, param_min[param], param_max[param]) *
              (1 << param_shift[param]);

     out_params->wmmat[param] = center + output;
   }
 }
 #endif  // CONFIG_IMPROVED_GLOBAL_MOTION

 int_mv get_warp_motion_vector_xy_pos(const MACROBLOCKD *xd,
                                      const WarpedMotionParams *model,
                                      const int x, const int y,
                                      MvSubpelPrecision precision);
 int get_model_from_corner_mvs(WarpedMotionParams *derive_model, int *pts,
                               int np, int *mvs, const BLOCK_SIZE bsize
 #if CONFIG_ACROSS_SCALE_WARP
                               ,
                               const struct scale_factors *sf
 #endif  // CONFIG_ACROSS_SCALE_WARP
 );
 #endif  // AOM_AV1_COMMON_WARPED_MOTION_H_
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
	#define AOM_AV1_COMMON_WARPED_MOTION_H_

	#include <stdio.h>
	#include <stdlib.h>
	#include <memory.h>
	#include <math.h>
	#include <assert.h>
	#include <stdbool.h>

	#include "config/aom_config.h"

	#include "aom_ports/mem.h"
	#include "aom_dsp/aom_dsp_common.h"
	#include "av1/common/mv.h"
	#include "av1/common/convolve.h"
	#include "av1/common/blockd.h"

	#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
	#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
	#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
	#define WARPED_MOTION_DEBUG 0
	#define DEFAULT_WMTYPE AFFINE
	#define WARP_ERROR_BLOCK_LOG 5
	#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)

	#if CONFIG_EXT_WARP_FILTER
	#define EXT_WARP_TAPS 6
	#define EXT_WARP_PHASES_LOG2 6
	#define EXT_WARP_PHASES (1 << EXT_WARP_PHASES_LOG2)
	#define EXT_WARP_ROUND_BITS (WARPEDMODEL_PREC_BITS - EXT_WARP_PHASES_LOG2)

	// The extended warp filter is a 6-tap filter, but we store each kernel with
	// two extra zeros at the end so that each kernel is 16-byte aligned
	#define EXT_WARP_STORAGE_TAPS 8
	#endif // CONFIG_EXT_WARP_FILTER

	#define DIV_LUT_PREC_BITS 14
	#define DIV_LUT_BITS 8
	#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

	extern const uint16_t div_lut[DIV_LUT_NUM + 1];

	// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
	// at precision of DIV_LUT_PREC_BITS along with the shift.
	static INLINE int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
	int64_t f;
	*shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
	: get_msb((unsigned int)D));
	// e is obtained from D after resetting the most significant 1 bit.
	const int64_t e = D - ((uint64_t)1 << *shift);
	// Get the most significant DIV_LUT_BITS (8) bits of e into f
	if (*shift > DIV_LUT_BITS)
	f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
	else
	f = e << (DIV_LUT_BITS - *shift);
	assert(f <= DIV_LUT_NUM);
	*shift += DIV_LUT_PREC_BITS;
	// Use f as lookup into the precomputed table of multipliers
	return div_lut[f];
	}

	static INLINE int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
	int32_t f;
	*shift = get_msb(D);
	// e is obtained from D after resetting the most significant 1 bit.
	const int32_t e = D - ((uint32_t)1 << *shift);
	// Get the most significant DIV_LUT_BITS (8) bits of e into f
	if (*shift > DIV_LUT_BITS)
	f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
	else
	f = e << (DIV_LUT_BITS - *shift);
	assert(f <= DIV_LUT_NUM);
	*shift += DIV_LUT_PREC_BITS;
	// Use f as lookup into the precomputed table of multipliers
	return div_lut[f];
	}

	static INLINE int16_t resolve_divisor_32_CfL(int32_t N, int32_t D,
	int16_t shift) {
	int32_t f_n, f_d;
	int ret;

	assert(D >= 0);
	int sign_N = N >= 0 ? 0 : 1;

	if (sign_N) N = -N;

	if (N == 0 \|\| D == 0)
	return 0;
	else {
	int16_t shift_n = get_msb(N);
	int16_t shift_d = get_msb(D);
	// e is obtained from D after resetting the most significant 1 bit.
	const int32_t e_d = D - ((uint32_t)1 << shift_d);
	// Get the most significant DIV_LUT_BITS (8) bits of e into f
	if (shift_d > DIV_LUT_BITS)
	f_d = ROUND_POWER_OF_TWO(e_d, shift_d - DIV_LUT_BITS);
	else
	f_d = e_d << (DIV_LUT_BITS - shift_d);
	assert(f_d <= DIV_LUT_NUM);

	if (shift_n > DIV_LUT_BITS)
	f_n = ROUND_POWER_OF_TWO(N, shift_n - DIV_LUT_BITS);
	else
	f_n = N << (DIV_LUT_BITS - shift_n);

	assert(f_d <= DIV_LUT_NUM);
	assert(f_n <= DIV_LUT_NUM * 2);

	const int shift_add = shift_d - shift_n - shift;

	// The maximum value of `div_lut[f_d] * f_n` is
	// `1 << (DIV_LUT_PREC_BITS + DIV_LUT_BITS + 1)`
	// Hence `shift_add`below is constrained to be <= 1.
	if (shift_add <= 1) {
	ret = (div_lut[f_d] * f_n) >>
	(DIV_LUT_PREC_BITS + DIV_LUT_BITS + shift_add);
	} else {
	ret = 0;
	}
	if (ret >= (2 << shift) - 1) ret = (2 << shift) - 1;

	if (sign_N) ret = -ret;
	return ret;
	}
	}

	#if CONFIG_RELAX_AFFINE_CONSTRAINTS
	extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 7 + 1][8];
	#else
	extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
	#endif // CONFIG_RELAX_AFFINE_CONSTRAINTS

	DECLARE_ALIGNED(8, extern const int8_t,
	av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);

	#if CONFIG_EXT_WARP_FILTER
	extern const int16_t av1_ext_warped_filter[EXT_WARP_PHASES + 1]
	[EXT_WARP_STORAGE_TAPS];
	#endif // CONFIG_EXT_WARP_FILTER

	static const uint8_t warp_pad_left[14][16] = {
	{ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
	{ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
	{ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
	{ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
	{ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
	{ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
	{ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 },
	};

	static const uint8_t warp_pad_right[14][16] = {
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
	{ 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
	{ 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
	{ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
	{ 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
	{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
	{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
	};

	// Recompute the translational part of a warp model, so that the center
	// of the current block (determined by `mi_row`, `mi_col`, `bsize`)
	// has an induced motion vector of `mv`
	void av1_set_warp_translation(int mi_row, int mi_col, BLOCK_SIZE bsize, MV mv,
	WarpedMotionParams *wm);

	void highbd_warp_plane(WarpedMotionParams wm, const uint16_t const ref,
	int width, int height, int stride, uint16_t *const pred,
	int p_col, int p_row, int p_width, int p_height,
	int p_stride, int subsampling_x, int subsampling_y,
	int bd, ConvolveParams *conv_params
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	#if CONFIG_OPFL_MEMBW_REDUCTION
	,
	int use_damr_padding, ReferenceArea *ref_area
	#endif // CONFIG_OPFL_MEMBW_REDUCTION
	#if CONFIG_WARP_BD_BOX
	,
	int use_warp_bd_box, WarpBoundaryBox *warp_bd_box,
	int use_warp_bd_damr, WarpBoundaryBox *warp_bd_box_damr
	#endif // CONFIG_WARP_BD_BOX
	);

	void warp_plane(WarpedMotionParams wm, const uint8_t const ref, int width,
	int height, int stride, uint8_t *pred, int p_col, int p_row,
	int p_width, int p_height, int p_stride, int subsampling_x,
	int subsampling_y, ConvolveParams *conv_params);
	#if CONFIG_AFFINE_REFINEMENT && CONFIG_EXT_WARP_FILTER
	void av1_warp_plane_ext(WarpedMotionParams wm, int bd, const uint16_t ref,
	int width, int height, int stride, uint16_t *pred,
	int p_col, int p_row, int p_width, int p_height,
	int p_stride, int subsampling_x, int subsampling_y,
	ConvolveParams *conv_params);
	#endif // CONFIG_AFFINE_REFINEMENT && CONFIG_EXT_WARP_FILTER
	void av1_warp_plane(WarpedMotionParams wm, int bd, const uint16_t ref,
	int width, int height, int stride, uint16_t *pred,
	int p_col, int p_row, int p_width, int p_height,
	int p_stride, int subsampling_x, int subsampling_y,
	ConvolveParams *conv_params
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	#if CONFIG_OPFL_MEMBW_REDUCTION
	,
	int use_damr_padding, ReferenceArea *ref_area
	#endif // CONFIG_OPFL_MEMBW_REDUCTION
	#if CONFIG_WARP_BD_BOX
	,
	int use_warp_bd_box, WarpBoundaryBox *warp_bd_box,
	int use_warp_bd_damr, WarpBoundaryBox *warp_bd_box_damr
	#endif // CONFIG_WARP_BD_BOX
	);

	int av1_find_projection(int np, const int pts1, const int pts2,
	BLOCK_SIZE bsize, MV mv, WarpedMotionParams *wm_params,
	int mi_row, int mi_col
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	);

	int av1_get_shear_params(WarpedMotionParams *wm
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	);

	// Reduce the precision of a warp model, ready for use in the warp filter
	// and for storage. This should be called after the non-translational parameters
	// are calculated, but before av1_set_warp_translation() or
	// av1_get_shear_params() are called
	void av1_reduce_warp_model(WarpedMotionParams *wm);

	#if CONFIG_EXT_WARP_FILTER
	// Check if a model is already properly reduced, according to the same logic
	// used in av1_reduce_warp_model()
	bool av1_is_warp_model_reduced(WarpedMotionParams *wm);
	#endif // CONFIG_EXT_WARP_FILTER

	int av1_extend_warp_model(const bool neighbor_is_above, const BLOCK_SIZE bsize,
	const MV *center_mv, const int mi_row,
	const int mi_col,
	const WarpedMotionParams *neighbor_wm,
	WarpedMotionParams *wm_params
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	);

	#if CONFIG_IMPROVED_GLOBAL_MOTION
	// Given a warp model which was initially used at a temporal distance of
	// `in_distance`, rescale it to a new temporal distance of `out_distance`.
	// Both distances are allowed to be negative, but they must be nonzero.
	//
	// The mathematically ideal way to rescale a warp model from one temporal
	// distance to another would be to use a matrix exponential: If we write the
	// input model as a 3x3 matrix M, then the output model should be
	//
	// ideal output = M ^ (out_distance / in_distance)
	//
	// However, computing a matrix exponential is complicated, especially in
	// fixed point, and so would not be very hardware friendly. In addition,
	// this function is mainly used to predict global motion parameters, with
	// the true values being coded as a delta from this prediction. As the
	// global motion will not be perfectly consistent, there's a limit to how
	// accurate our prediction can be.
	//
	// For these reasons, we approximate the matrix exponential using its
	// first-order Taylor series:
	//
	// output = I + (M - I) * (out_distance / in_distance)
	//
	// This is far easier to compute, and provides a "good enough" approximation
	// for the models we use in practice, which are all reasonably near to the
	// identity model (all parameters except for the translational part are
	// within +/- 1/2 of the identity).
	static INLINE void av1_scale_warp_model(const WarpedMotionParams *in_params,
	int in_distance,
	WarpedMotionParams *out_params,
	int out_distance) {
	static int param_shift[MAX_PARAMDIM] = {
	GM_TRANS_PREC_DIFF, GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF,
	GM_ALPHA_PREC_DIFF, GM_ALPHA_PREC_DIFF, GM_ALPHA_PREC_DIFF
	};

	static int param_min[MAX_PARAMDIM] = { GM_TRANS_MIN, GM_TRANS_MIN,
	GM_ALPHA_MIN, GM_ALPHA_MIN,
	GM_ALPHA_MIN, GM_ALPHA_MIN };

	static int param_max[MAX_PARAMDIM] = { GM_TRANS_MAX, GM_TRANS_MAX,
	GM_ALPHA_MAX, GM_ALPHA_MAX,
	GM_ALPHA_MAX, GM_ALPHA_MAX };

	// If in_distance == 0, then we can't meaningfully scale the model because
	// this would correspond to a division by 0. So we fall back to using the
	// identity model as a reference.
	//
	// Note: Global motion is disabled for temporal distances of 0, so in this
	// situation the input model must be the identity model anyway. Check this
	// constraint here to help keep things internally consistent.
	if (in_distance == 0) {
	*out_params = default_warp_params;
	return;
	}

	// If out_distance == 0, then global motion is disabled, so we shouldn't
	// get to this function
	assert(out_distance != 0);

	// Flip signs so that in_distance is positive.
	// We do this because
	// scaled_value = (... + divisor/2) / divisor
	// is the simplest way to implement division with round-to-nearest in C,
	// but it only works correctly if the divisor is positive
	if (in_distance < 0) {
	in_distance = -in_distance;
	out_distance = -out_distance;
	}

	out_params->wmtype = in_params->wmtype;
	for (int param = 0; param < MAX_PARAMDIM; param++) {
	int center = default_warp_params.wmmat[param];

	int input = in_params->wmmat[param] - center;
	int divisor = in_distance * (1 << param_shift[param]);
	int output = (int)(((int64_t)input * out_distance + divisor / 2) / divisor);
	output = clamp(output, param_min[param], param_max[param]) *
	(1 << param_shift[param]);

	out_params->wmmat[param] = center + output;
	}
	}
	#endif // CONFIG_IMPROVED_GLOBAL_MOTION

	int_mv get_warp_motion_vector_xy_pos(const MACROBLOCKD *xd,
	const WarpedMotionParams *model,
	const int x, const int y,
	MvSubpelPrecision precision);
	int get_model_from_corner_mvs(WarpedMotionParams derive_model, int pts,
	int np, int *mvs, const BLOCK_SIZE bsize
	#if CONFIG_ACROSS_SCALE_WARP
	,
	const struct scale_factors *sf
	#endif // CONFIG_ACROSS_SCALE_WARP
	);
	#endif // AOM_AV1_COMMON_WARPED_MOTION_H_