Move av1_warp_affine_common impl from warp_plane_neon.h

The av1_warp_affine_common implementation is only used by
av1_warp_affine_neon, so move the implementation into that function.

Delete warp_affine_horizontal as well, as it is no longer a common
function.

Delete horizontal_filter_4x1_f1_beta0 and horizontal_filter_8x1_f1_beta0
in warp_plane_neon_i8mm.c and warp_plane_sve.c, as they are no longer
required by the header file.

Change-Id: I200f6fa15e6babacff976e079938e318529425ae

diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index 497273b..f4c1377 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c

@@ -272,13 +272,106 @@
   *res_high = horizontal_add_4d_s32x4(m4567_pairs);
 }
 
+static AOM_FORCE_INLINE void warp_affine_horizontal_neon(
+    const uint8_t *ref, int width, int height, int stride, int p_width,
+    int p_height, int16_t alpha, int16_t beta, const int64_t x4,
+    const int64_t y4, const int i, int16x8_t tmp[]) {
+  const int height_limit = AOMMIN(8, p_height - i) + 7;
+
+  int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+  int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+
+  int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+  sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+  sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+  if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+                               height_limit, tmp)) {
+    return;
+  }
+
+  static const uint8_t kIotaArr[] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                                      8, 9, 10, 11, 12, 13, 14, 15 };
+  const uint8x16_t indx = vld1q_u8(kIotaArr);
+
+  const int out_of_boundary_left = -(ix4 - 6);
+  const int out_of_boundary_right = (ix4 + 8) - width;
+
+  if (p_width == 4) {
+    if (beta == 0) {
+      if (alpha == 0) {
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
+      } else {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
+      }
+    } else {
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1,
+                               (sx4 + beta * (k - 3)));
+      } else {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)),
+                               alpha);
+      }
+    }
+  } else {
+    if (beta == 0) {
+      if (alpha == 0) {
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
+      } else {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);
+      }
+    } else {
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1,
+                               (sx4 + beta * (k - 3)));
+      } else {
+        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)),
+                               alpha);
+      }
+    }
+  }
+}
+
 void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
                           int height, int stride, uint8_t *pred, int p_col,
                           int p_row, int p_width, int p_height, int p_stride,
                           int subsampling_x, int subsampling_y,
                           ConvolveParams *conv_params, int16_t alpha,
                           int16_t beta, int16_t gamma, int16_t delta) {
-  av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
-                         p_width, p_height, p_stride, subsampling_x,
-                         subsampling_y, conv_params, alpha, beta, gamma, delta);
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const int is_compound = conv_params->is_compound;
+  uint16_t *const dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+  assert(IMPLIES(is_compound, dst != NULL));
+  assert(IMPLIES(do_average, is_compound));
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int16x8_t tmp[15];
+      warp_affine_horizontal_neon(ref, width, height, stride, p_width, p_height,
+                                  alpha, beta, x4, y4, i, tmp);
+      warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
+                           dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
+                           delta, y4, i, j, tmp, w0, w1);
+    }
+  }
 }

diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
index 2909df7b..6c50c41 100644
--- a/av1/common/arm/warp_plane_neon.h
+++ b/av1/common/arm/warp_plane_neon.h

@@ -24,24 +24,6 @@
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in,
-                                                           int sx, int alpha);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in,
-                                                           int sx, int alpha);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
-                                                           int sx);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
-                                                           int sx);
-
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16);
-
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16);
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
                                                     int32x4_t *res, int sy);
 
@@ -95,21 +77,12 @@
   return clamp(iy, 0, height - 1);
 }
 
-static AOM_FORCE_INLINE void warp_affine_horizontal(
-    const uint8_t *ref, int width, int height, int stride, int p_width,
-    int p_height, int16_t alpha, int16_t beta, const int64_t x4,
-    const int64_t y4, const int i, int16x8_t tmp[]) {
+static inline bool warp_affine_special_case(const uint8_t *ref, int32_t ix4,
+                                            int32_t iy4, int width, int height,
+                                            int stride, const int height_limit,
+                                            int16x8_t tmp[]) {
   const int bd = 8;
   const int reduce_bits_horiz = ROUND0_BITS;
-  const int height_limit = AOMMIN(8, p_height - i) + 7;
-
-  int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
-  int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
-
-  int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-  sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-  sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
   if (ix4 <= -7) {
     for (int k = 0; k < height_limit; ++k) {
@@ -119,7 +92,7 @@
           ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
       tmp[k] = vdupq_n_s16(dup_val);
     }
-    return;
+    return true;
   } else if (ix4 >= width + 6) {
     for (int k = 0; k < height_limit; ++k) {
       int iy = clamp_iy(iy4 + k - 7, height);
@@ -128,15 +101,11 @@
                             (1 << (FILTER_BITS - reduce_bits_horiz));
       tmp[k] = vdupq_n_s16(dup_val);
     }
-    return;
+    return true;
   }
 
-  static const uint8_t kIotaArr[] = { 0, 1, 2,  3,  4,  5,  6,  7,
-                                      8, 9, 10, 11, 12, 13, 14, 15 };
-  const uint8x16_t indx = vld1q_u8(kIotaArr);
-
-  const int out_of_boundary_left = -(ix4 - 6);
-  const int out_of_boundary_right = (ix4 + 8) - width;
+  return false;
+}
 
 #define APPLY_HORIZONTAL_SHIFT(fn, ...)                                \
   do {                                                                 \
@@ -172,45 +141,6 @@
     }                                                                  \
   } while (0)
 
-  if (p_width == 4) {
-    if (beta == 0) {
-      if (alpha == 0) {
-        int16x8_t f_s16 =
-            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
-      } else {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
-      }
-    } else {
-      if (alpha == 0) {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1,
-                               (sx4 + beta * (k - 3)));
-      } else {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)),
-                               alpha);
-      }
-    }
-  } else {
-    if (beta == 0) {
-      if (alpha == 0) {
-        int16x8_t f_s16 =
-            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
-      } else {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);
-      }
-    } else {
-      if (alpha == 0) {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1,
-                               (sx4 + beta * (k - 3)));
-      } else {
-        APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)),
-                               alpha);
-      }
-    }
-  }
-}
-
 static AOM_FORCE_INLINE void warp_affine_vertical(
     uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound,
     uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg,
@@ -339,43 +269,4 @@
   }
 }
 
-static AOM_FORCE_INLINE void av1_warp_affine_common(
-    const int32_t *mat, const uint8_t *ref, int width, int height, int stride,
-    uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
-    int p_stride, int subsampling_x, int subsampling_y,
-    ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
-    int16_t delta) {
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const int is_compound = conv_params->is_compound;
-  uint16_t *const dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  const int do_average = conv_params->do_average;
-  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
-
-  assert(IMPLIES(is_compound, dst != NULL));
-  assert(IMPLIES(do_average, is_compound));
-
-  for (int i = 0; i < p_height; i += 8) {
-    for (int j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int64_t dst_x =
-          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
-      const int64_t dst_y =
-          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
-
-      const int64_t x4 = dst_x >> subsampling_x;
-      const int64_t y4 = dst_y >> subsampling_y;
-
-      int16x8_t tmp[15];
-      warp_affine_horizontal(ref, width, height, stride, p_width, p_height,
-                             alpha, beta, x4, y4, i, tmp);
-      warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
-                           dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
-                           delta, y4, i, j, tmp, w0, w1);
-    }
-  }
-}
-
 #endif  // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_

diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
index 2d02974..44689e9 100644
--- a/av1/common/arm/warp_plane_neon_i8mm.c
+++ b/av1/common/arm/warp_plane_neon_i8mm.c

@@ -143,10 +143,11 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
+                                                           int sx) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
 
   uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -166,12 +167,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
-                                                           int sx) {
-  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
-  return horizontal_filter_4x1_f1_beta0(in, f_s16);
-}
-
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0(
     const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
@@ -219,10 +214,11 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
+                                                           int sx) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
 
   uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -248,12 +244,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
-                                                           int sx) {
-  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
-  return horizontal_filter_8x1_f1_beta0(in, f_s16);
-}
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
                                                     int32x4_t *res, int sy) {
   int16x4_t s0 = vget_low_s16(src[0]);
@@ -387,8 +377,6 @@
     const uint8_t *ref, int width, int height, int stride, int p_width,
     int p_height, int16_t alpha, int16_t beta, const int64_t x4,
     const int64_t y4, const int i, int16x8_t tmp[]) {
-  const int bd = 8;
-  const int reduce_bits_horiz = ROUND0_BITS;
   const int height_limit = AOMMIN(8, p_height - i) + 7;
 
   int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
@@ -399,23 +387,8 @@
          (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
   sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
-  if (ix4 <= -7) {
-    for (int k = 0; k < height_limit; ++k) {
-      int iy = clamp_iy(iy4 + k - 7, height);
-      int16_t dup_val =
-          (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-          ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
-      tmp[k] = vdupq_n_s16(dup_val);
-    }
-    return;
-  } else if (ix4 >= width + 6) {
-    for (int k = 0; k < height_limit; ++k) {
-      int iy = clamp_iy(iy4 + k - 7, height);
-      int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-                        ref[iy * stride + (width - 1)] *
-                            (1 << (FILTER_BITS - reduce_bits_horiz));
-      tmp[k] = vdupq_n_s16(dup_val);
-    }
+  if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+                               height_limit, tmp)) {
     return;
   }
 

diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
index 455e29d..885ffe8 100644
--- a/av1/common/arm/warp_plane_sve.c
+++ b/av1/common/arm/warp_plane_sve.c

@@ -146,10 +146,11 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
+                                                           int sx) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
 
   uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -169,12 +170,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
-                                                           int sx) {
-  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
-  return horizontal_filter_4x1_f1_beta0(in, f_s16);
-}
-
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0(
     const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
@@ -222,10 +217,11 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
+                                                           int sx) {
   const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
 
   uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -251,12 +247,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
-                                                           int sx) {
-  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
-  return horizontal_filter_8x1_f1_beta0(in, f_s16);
-}
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
                                                     int32x4_t *res, int sy) {
   int16x4_t s0 = vget_low_s16(src[0]);
@@ -381,8 +371,6 @@
     const uint8_t *ref, int width, int height, int stride, int p_width,
     int p_height, int16_t alpha, int16_t beta, const int64_t x4,
     const int64_t y4, const int i, int16x8_t tmp[]) {
-  const int bd = 8;
-  const int reduce_bits_horiz = ROUND0_BITS;
   const int height_limit = AOMMIN(8, p_height - i) + 7;
 
   int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
@@ -393,23 +381,8 @@
          (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
   sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
-  if (ix4 <= -7) {
-    for (int k = 0; k < height_limit; ++k) {
-      int iy = clamp_iy(iy4 + k - 7, height);
-      int16_t dup_val =
-          (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-          ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
-      tmp[k] = vdupq_n_s16(dup_val);
-    }
-    return;
-  } else if (ix4 >= width + 6) {
-    for (int k = 0; k < height_limit; ++k) {
-      int iy = clamp_iy(iy4 + k - 7, height);
-      int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-                        ref[iy * stride + (width - 1)] *
-                            (1 << (FILTER_BITS - reduce_bits_horiz));
-      tmp[k] = vdupq_n_s16(dup_val);
-    }
+  if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+                               height_limit, tmp)) {
     return;
   }