rtc: Add optimized scaler for 2:1 horizontal-only downscaling with the bilinear filter, for x86 only. This also fixes a bug in the use of aom_scaled_2d() for horizontal-only or vertical-only scaling: x_q4/y_q4 was not set to 0 when the width/height is unchanged (no scale change in that direction), so for 2:1 horizontal scaling it was still filtering along the vertical direction. The eight-tap case and ARM support are still to be added. Change-Id: I994195e578471c1e898e7fed55c71ea3bb792f8d

commit: c43795cd00633c383898d649c94108a72e3a076f [log] [tgz]
author: Marco Paniconi <marpan@google.com> Mon Nov 06 14:33:17 2023 -0800
committer: Marco Paniconi <marpan@google.com> Wed Nov 29 12:16:08 2023 -0800
tree: 39426e5f0dc75427d39917ffbec8f0389e86d2ae
parent: fbed8eca54b0e56529391ede26a13a921c3a1fb9 [diff]
diff --git a/av1/common/resize.c b/av1/common/resize.c
index f89f7ca..55dfc6f 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c

@@ -1247,9 +1247,11 @@
     uint8_t *dst_buffer = dst->buffers[i];
     const int dst_stride = dst->strides[is_uv];
     for (int y = 0; y < dst_h; y += 16) {
-      const int y_q4 = y * 16 * src_h / dst_h + phase_scaler;
+      const int y_q4 =
+          src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler;
       for (int x = 0; x < dst_w; x += 16) {
-        const int x_q4 = x * 16 * src_w / dst_w + phase_scaler;
+        const int x_q4 =
+            src_w == dst_w ? 0 : x * 16 * src_w / dst_w + phase_scaler;
         const uint8_t *src_ptr =
             src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w;
         uint8_t *dst_ptr = dst_buffer + y * dst_stride + x;

diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
index a7fdb5a..622e4ec 100644
--- a/av1/common/x86/resize_ssse3.c
+++ b/av1/common/x86/resize_ssse3.c

@@ -89,6 +89,29 @@
   return temp;
 }
 
+static void scale_plane_2_to_1_phase_0_horiz(const uint8_t *src,  // Phase-0 downscale of one plane, 2:1 in width only; height is unchanged.
+                                             const ptrdiff_t src_stride,  // Pointer step between consecutive source rows.
+                                             uint8_t *dst,
+                                             const ptrdiff_t dst_stride,  // Pointer step between consecutive destination rows.
+                                             const int dst_w, const int dst_h) {  // Destination size; both must be >= 1 because the loops below are do/while.
+  const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to a multiple of 16, the number of bytes stored per step.
+  const __m128i mask = _mm_set1_epi16(0x00FF);  // 0x00FF in every 16-bit lane; presumably lets the kernel keep one byte of each source pair - confirm against the kernel.
+  int y = dst_h;  // Rows left to produce.
+
+  do {
+    int x = max_width;  // Output bytes left in this row, counted in padded units.
+    do {
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);  // Kernel body is not shown here; assumed to reduce 32 source bytes to 16 - TODO confirm.
+      _mm_storeu_si128((__m128i *)dst, d);  // Unaligned 16-byte store.
+      src += 32;  // Two source bytes consumed per output byte.
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += src_stride - 2 * max_width;  // Net advance of exactly one source row (not two): no vertical scaling.
+    dst += dst_stride - max_width;  // Net advance of one destination row.
+  } while (--y);  // NOTE(review): each row touches max_width output bytes and 2 * max_width input bytes, up to 15 / 30 past the requested width - presumably covered by buffer padding; confirm.
+}
+
 static void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                        const ptrdiff_t src_stride, uint8_t *dst,
                                        const ptrdiff_t dst_stride,
@@ -148,6 +171,32 @@
   return _mm_packus_epi16(t4, t5);
 }
 
+static void scale_plane_2_to_1_bilinear_horiz(const uint8_t *src,  // Bilinear downscale of one plane, 2:1 in width only; height is unchanged.
+                                              const ptrdiff_t src_stride,  // Pointer step between consecutive source rows.
+                                              uint8_t *dst,
+                                              const ptrdiff_t dst_stride,  // Pointer step between consecutive destination rows.
+                                              const int dst_w, const int dst_h,  // Destination size; both must be >= 1 because the loops below are do/while.
+                                              const __m128i c0c1) {  // Filter coefficient pair, passed through unchanged to the bilinear kernel.
+  const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to a multiple of 16, the number of bytes stored per step.
+  int y = dst_h;  // Rows left to produce.
+  do {
+    int x = max_width;  // Output bytes left in this row, counted in padded units.
+    do {
+      __m128i s[2], d;
+      // Horizontal
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));  // First 16 of the 32 source bytes for this step (unaligned load).
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));  // Second 16 source bytes.
+      d = scale_plane_bilinear_kernel(s, c0c1);  // Kernel body is only partly shown; assumed to filter adjacent byte pairs into 16 output bytes - TODO confirm.
+      _mm_storeu_si128((__m128i *)dst, d);  // Unaligned 16-byte store.
+      src += 32;  // Two source bytes consumed per output byte.
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += src_stride - (max_width << 1);  // Net advance of exactly one source row: no vertical filtering or decimation.
+    dst += dst_stride - max_width;  // Net advance of one destination row.
+  } while (--y);  // NOTE(review): each row touches max_width output bytes and 2 * max_width input bytes, up to 15 / 30 past the requested width - presumably covered by buffer padding; confirm.
+}
+
 static void scale_plane_2_to_1_bilinear(const uint8_t *src,
                                         const ptrdiff_t src_stride,
                                         uint8_t *dst,
@@ -809,13 +858,15 @@
   } while (--y);
 }
 
-// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling
+// There are SIMD optimizations for: (1/4 x 1/4), (1/2 x 1/2), (3/4 x 3/4),
+// horizontal-only downscale (1/2 x 1), and for (2 x 2) upscaling,
+// in SSSE3.
 static INLINE bool has_normative_scaler_ssse3(const int src_width,
                                               const int src_height,
                                               const int dst_width,
                                               const int dst_height) {
   const bool has_normative_scaler =
+      (2 * dst_width == src_width && dst_height == src_height) ||
       (2 * dst_width == src_width && 2 * dst_height == src_height) ||
       (4 * dst_width == src_width && 4 * dst_height == src_height) ||
       (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) ||
@@ -839,7 +890,7 @@
                                    dst->uv_crop_width, dst->uv_crop_height);
   }
 
-  if (!has_normative_scaler) {
+  if (!has_normative_scaler || filter != BILINEAR) {
     av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
     return;
   }
@@ -857,19 +908,30 @@
     const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
     const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
 
-    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+    if ((2 * dst_w == src_w && 2 * dst_h == src_h) ||
+        (2 * dst_w == src_w && dst_h == src_h)) {
       // 2 to 1
       if (phase == 0) {
-        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h);
+        if (2 * dst_w == src_w && dst_h == src_h)
+          scale_plane_2_to_1_phase_0_horiz(src->buffers[i], src->strides[is_uv],
+                                           dst->buffers[i], dst->strides[is_uv],
+                                           dst_w, dst_h);
+        else
+          scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h);
       } else if (filter == BILINEAR) {
         const int16_t c0 = av1_bilinear_filters[phase][3];
         const int16_t c1 = av1_bilinear_filters[phase][4];
         const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
-        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
-                                    dst->buffers[i], dst->strides[is_uv], dst_w,
-                                    dst_h, c0c1);
+        if (2 * dst_w == src_w && dst_h == src_h)
+          scale_plane_2_to_1_bilinear_horiz(
+              src->buffers[i], src->strides[is_uv], dst->buffers[i],
+              dst->strides[is_uv], dst_w, dst_h, c0c1);
+        else
+          scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                      dst->buffers[i], dst->strides[is_uv],
+                                      dst_w, dst_h, c0c1);
       } else {
         const int buffer_stride = (dst_y_w + 3) & ~3;
         const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
commit	c43795cd00633c383898d649c94108a72e3a076f	[log] [tgz]
author	Marco Paniconi <marpan@google.com>	Mon Nov 06 14:33:17 2023 -0800
committer	Marco Paniconi <marpan@google.com>	Wed Nov 29 12:16:08 2023 -0800
tree	39426e5f0dc75427d39917ffbec8f0389e86d2ae
parent	fbed8eca54b0e56529391ede26a13a921c3a1fb9 [diff]