// av1/common/arm/reconintra_neon.c - aom.git - Git at Google

 /*
  * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <arm_neon.h>
 #include <assert.h>

 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"

 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"

 #define MAX_UPSAMPLE_SZ 16
 #define FILTER_INTRA_SCALE_BITS 4

 // Filter-intra tap table, indexed as [mode][tap row][lane].
 //
 // These kernels are a transposed version of those defined in reconintra.c,
 // with the absolute value of the negatives taken in the top row.
 //
 // How the rows are consumed in this file: row i is loaded as the vector f<i>
 // and multiplied lane-wise by the broadcast neighbour s<i> in
 // filter_intra_predictor(). Row 0 is subtracted from the accumulator (which is
 // why it is stored as magnitudes); rows 1-6 are added. The callers pass the
 // top-left pixel as s0, four pixels of the row above as s1..s4 and two pixels
 // of the left column as s5..s6. Lanes 0-3 of the result form the upper row of
 // a 4x2 output patch and lanes 4-7 the lower row.
 DECLARE_ALIGNED(16, static const uint8_t,
                 av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = {
   // clang-format off
   {
       {  6,  5,  3,  3,  4,  3,  3,  3 },
       { 10,  2,  1,  1,  6,  2,  2,  1 },
       {  0, 10,  1,  1,  0,  6,  2,  2 },
       {  0,  0, 10,  2,  0,  0,  6,  2 },
       {  0,  0,  0, 10,  0,  0,  0,  6 },
       { 12,  9,  7,  5,  2,  2,  2,  3 },
       {  0,  0,  0,  0, 12,  9,  7,  5 }
   },
   {
       { 10,  6,  4,  2, 10,  6,  4,  2 },
       { 16,  0,  0,  0, 16,  0,  0,  0 },
       {  0, 16,  0,  0,  0, 16,  0,  0 },
       {  0,  0, 16,  0,  0,  0, 16,  0 },
       {  0,  0,  0, 16,  0,  0,  0, 16 },
       { 10,  6,  4,  2,  0,  0,  0,  0 },
       {  0,  0,  0,  0, 10,  6,  4,  2 }
   },
   {
       {  8,  8,  8,  8,  4,  4,  4,  4 },
       {  8,  0,  0,  0,  4,  0,  0,  0 },
       {  0,  8,  0,  0,  0,  4,  0,  0 },
       {  0,  0,  8,  0,  0,  0,  4,  0 },
       {  0,  0,  0,  8,  0,  0,  0,  4 },
       { 16, 16, 16, 16,  0,  0,  0,  0 },
       {  0,  0,  0,  0, 16, 16, 16, 16 }
   },
   {
       {  2,  1,  1,  0,  1,  1,  1,  1 },
       {  8,  3,  2,  1,  4,  3,  2,  2 },
       {  0,  8,  3,  2,  0,  4,  3,  2 },
       {  0,  0,  8,  3,  0,  0,  4,  3 },
       {  0,  0,  0,  8,  0,  0,  0,  4 },
       { 10,  6,  4,  2,  3,  4,  4,  3 },
       {  0,  0,  0,  0, 10,  6,  4,  3 }
   },
   {
       { 12, 10,  9,  8, 10,  9,  8,  7 },
       { 14,  0,  0,  0, 12,  1,  0,  0 },
       {  0, 14,  0,  0,  0, 12,  0,  0 },
       {  0,  0, 14,  0,  0,  0, 12,  1 },
       {  0,  0,  0, 14,  0,  0,  0, 12 },
       { 14, 12, 11, 10,  0,  0,  1,  1 },
       {  0,  0,  0,  0, 14, 12, 11,  9 }
   }
   // clang-format on
 };

 // Evaluates one 4x2 filter-intra patch (8 output lanes).
 //
 // s0..s6 are the seven neighbour pixels, each broadcast across all lanes, and
 // f0..f6 are the matching per-lane tap vectors. The result per lane is
 //   (s1*f1 + s2*f2 + s3*f3 + s4*f4 + s5*f5 + s6*f6 - s0*f0)
 // rounded and shifted right by FILTER_INTRA_SCALE_BITS, then saturated to
 // the unsigned 8-bit range.
 static inline uint8x8_t filter_intra_predictor(
     uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4,
     uint8x8_t s5, uint8x8_t s6, const uint8x8_t f0, const uint8x8_t f1,
     const uint8x8_t f2, const uint8x8_t f3, const uint8x8_t f4,
     const uint8x8_t f5, const uint8x8_t f6) {
   // Gather the additive terms in a 16-bit accumulator.
   uint16x8_t sum = vmull_u8(s1, f1);
   sum = vmlal_u8(sum, s2, f2);
   sum = vmlal_u8(sum, s3, f3);
   sum = vmlal_u8(sum, s4, f4);
   sum = vmlal_u8(sum, s5, f5);
   sum = vmlal_u8(sum, s6, f6);

   // The f0 taps are stored as magnitudes of negative coefficients, so their
   // contribution is removed rather than added. The accumulator is modular, so
   // doing this last gives the same bits as doing it first.
   sum = vmlsl_u8(sum, s0, f0);

   // Reinterpret as signed so a negative total saturates to 0 when narrowing.
   const int16x8_t signed_sum = vreinterpretq_s16_u16(sum);
   return vqrshrun_n_s16(signed_sum, FILTER_INTRA_SCALE_BITS);
 }

 // Predicts one pair of output rows, one 4x2 patch per step.
 //
 // out          - first of the two rows to write.
 // top          - row directly above `out` (at least `width` pixels).
 // top_is_above - non-zero when `top` is the caller's `above` array, which is
 //                read with load_unaligned_u8_4x1(); otherwise `top` is a
 //                previously written row of the block, read with
 //                load_u8_4x1().
 // corner       - pixel up and to the left of out[0].
 // left         - left[0] / left[1] are the pixels left of the two rows.
 // taps         - the seven tap vectors f0..f6 for the current mode.
 static inline void filter_intra_row_pair_w4(uint8_t *out, ptrdiff_t stride,
                                             int width, const uint8_t *top,
                                             int top_is_above, uint8_t corner,
                                             const uint8_t *left,
                                             const uint8x8_t *taps) {
   uint8x8_t top_left = vdup_n_u8(corner);
   uint8x8_t left_upper = vdup_n_u8(left[0]);
   uint8x8_t left_lower = vdup_n_u8(left[1]);

   int col = 0;
   do {
     const uint8x8_t top4 = top_is_above ? load_unaligned_u8_4x1(top + col)
                                         : load_u8_4x1(top + col);
     const uint8x8_t top_last = vdup_lane_u8(top4, 3);

     const uint8x8_t patch = filter_intra_predictor(
         top_left, vdup_lane_u8(top4, 0), vdup_lane_u8(top4, 1),
         vdup_lane_u8(top4, 2), top_last, left_upper, left_lower, taps[0],
         taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);

     // NOTE(review): presumably writes lanes 0-3 at `out + col` and lanes 4-7
     // one stride below - confirm against mem_neon.h.
     store_u8x4_strided_x2(out + col, stride, patch);

     // The patch just produced is the left neighbour of the next one: its
     // last column (lanes 3 and 7) and the last pixel above it carry over.
     top_left = top_last;
     left_upper = vdup_lane_u8(patch, 3);
     left_lower = vdup_lane_u8(patch, 7);

     col += 4;
   } while (col < width);
 }

 // Predicts one pair of output rows, two adjacent 4x2 patches (8 columns) per
 // step. Parameters are as for filter_intra_row_pair_w4(); the row above is
 // always read with vld1_u8().
 static inline void filter_intra_row_pair_w8(uint8_t *out, ptrdiff_t stride,
                                             int width, const uint8_t *top,
                                             uint8_t corner,
                                             const uint8_t *left,
                                             const uint8x8_t *taps) {
   uint8x8_t top_left = vdup_n_u8(corner);
   uint8x8_t left_upper = vdup_n_u8(left[0]);
   uint8x8_t left_lower = vdup_n_u8(left[1]);

   int col = 0;
   do {
     const uint8x8_t top8 = vld1_u8(top + col);
     const uint8x8_t top3 = vdup_lane_u8(top8, 3);
     const uint8x8_t top7 = vdup_lane_u8(top8, 7);

     // Left 4x2 patch.
     const uint8x8_t patch_lo = filter_intra_predictor(
         top_left, vdup_lane_u8(top8, 0), vdup_lane_u8(top8, 1),
         vdup_lane_u8(top8, 2), top3, left_upper, left_lower, taps[0],
         taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);

     // Right 4x2 patch; its left neighbours are the last column of patch_lo.
     const uint8x8_t patch_hi = filter_intra_predictor(
         top3, vdup_lane_u8(top8, 4), vdup_lane_u8(top8, 5),
         vdup_lane_u8(top8, 6), top7, vdup_lane_u8(patch_lo, 3),
         vdup_lane_u8(patch_lo, 7), taps[0], taps[1], taps[2], taps[3],
         taps[4], taps[5], taps[6]);

     // Each patch holds {upper row, lower row}; zipping the 32-bit halves
     // yields two 8-pixel rows.
     const uint32x2x2_t rows = vzip_u32(vreinterpret_u32_u8(patch_lo),
                                        vreinterpret_u32_u8(patch_hi));
     vst1_u8(out + col, vreinterpret_u8_u32(rows.val[0]));
     vst1_u8(out + stride + col, vreinterpret_u8_u32(rows.val[1]));

     top_left = top7;
     left_upper = vdup_lane_u8(patch_hi, 3);
     left_lower = vdup_lane_u8(patch_hi, 7);

     col += 8;
   } while (col < width);
 }

 // Filter-intra prediction of a tx_size block into dst.
 //
 // The block is produced two rows at a time. The first row pair is predicted
 // from above[-1], above[0..width-1] and left[0..1]; every following pair is
 // predicted from the row of dst written just before it together with
 // left[r-1..r+1]. `mode` selects the tap set in av1_filter_intra_taps_neon.
 void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
                                      TX_SIZE tx_size, const uint8_t *above,
                                      const uint8_t *left, int mode) {
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   assert(width <= 32 && height <= 32);

   uint8x8_t taps[7];
   for (int i = 0; i < 7; ++i) {
     taps[i] = vld1_u8(av1_filter_intra_taps_neon[mode][i]);
   }

   // Computing 4 cols per iteration (instead of 8) for 8x<h> blocks is faster.
   if (width <= 8) {
     filter_intra_row_pair_w4(dst, stride, width, above, 1, above[-1], left,
                              taps);

     int row = 2;
     do {
       filter_intra_row_pair_w4(dst + row * stride, stride, width,
                                dst + (row - 1) * stride, 0, left[row - 1],
                                left + row, taps);
       row += 2;
     } while (row < height);
   } else {
     filter_intra_row_pair_w8(dst, stride, width, above, above[-1], left,
                              taps);

     int row = 2;
     do {
       filter_intra_row_pair_w8(dst + row * stride, stride, width,
                                dst + (row - 1) * stride, left[row - 1],
                                left + row, taps);
       row += 2;
     } while (row < height);
   }
 }

 // Writes lanes 0..count-1 of `v` to dst[0..count-1] and leaves every other
 // byte of dst untouched (and unread). `count` must be in 1..8.
 static inline void filter_edge_store_partial(uint8_t *dst, uint8x8_t v,
                                              int count) {
   uint8_t lanes[8];
   vst1_u8(lanes, v);
   memcpy(dst, lanes, (size_t)count);
 }

 // 3-tap {4, 8, 4} / 16 kernel over src[0..9]; lane i is centred on src[i + 1].
 static inline uint8x8_t filter_edge_kernel_484(const uint8_t *src) {
   const uint8x8_t a = vld1_u8(src);
   const uint8x8_t b = vld1_u8(src + 1);
   const uint8x8_t c = vld1_u8(src + 2);

   // Make use of the identity:
   // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
   const uint16x8_t outer = vaddl_u8(a, c);
   const uint16x8_t centre = vaddl_u8(b, b);
   return vrshrn_n_u16(vaddq_u16(outer, centre), 2);
 }

 // 3-tap {5, 6, 5} / 16 kernel over src[0..9]; lane i is centred on src[i + 1].
 static inline uint8x8_t filter_edge_kernel_565(const uint8_t *src) {
   uint16x8_t acc = vmull_u8(vld1_u8(src), vdup_n_u8(5));
   acc = vmlal_u8(acc, vld1_u8(src + 1), vdup_n_u8(6));
   acc = vmlal_u8(acc, vld1_u8(src + 2), vdup_n_u8(5));
   return vrshrn_n_u16(acc, 4);
 }

 // 5-tap {2, 4, 4, 4, 2} / 16 kernel over src[0..11]; lane i is centred on
 // src[i + 2].
 static inline uint8x8_t filter_edge_kernel_24442(const uint8_t *src) {
   const uint8x8_t a = vld1_u8(src);
   const uint8x8_t b = vld1_u8(src + 1);
   const uint8x8_t c = vld1_u8(src + 2);
   const uint8x8_t d = vld1_u8(src + 3);
   const uint8x8_t e = vld1_u8(src + 4);

   // Make use of the identity:
   // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
   const uint16x8_t outer = vaddl_u8(a, e);
   uint16x8_t inner = vaddw_u8(vaddl_u8(b, c), d);
   inner = vaddq_u16(inner, inner);
   return vrshrn_n_u16(vaddq_u16(outer, inner), 3);
 }

 // Smooths the edge p[0..sz-1] in place with a low-pass filter.
 //
 // p        - edge samples; p[0] is never modified, p[1..sz-1] are replaced by
 //            the filtered values. Samples outside the edge are taken to be
 //            equal to the nearest end sample.
 // sz       - number of samples, 0..129.
 // strength - 0: no-op; 1: {4, 8, 4}; 2: {5, 6, 5}; any other value:
 //            {2, 4, 4, 4, 2}. All kernels are normalised by 16 with rounding.
 //
 // Only bytes p[0..sz-1] are accessed: the final partial vector is written via
 // filter_edge_store_partial() rather than by a full 8-byte load/blend/store,
 // so the caller's buffer needs no padding beyond sz.
 void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
   if (!strength) return;
   assert(sz >= 0 && sz <= 129);

   // p[0] is kept as is, so fewer than two samples leaves nothing to filter.
   // Returning here also avoids touching an uninitialised edge[] for sz == 0.
   if (sz < 2) return;

   uint8_t edge[160];  // Max value of sz + enough padding for vector accesses.
   memcpy(edge + 1, p, sz * sizeof(*p));

   // Replicate the end samples: one extra on the left, two on the right.
   edge[0] = edge[1];
   edge[sz + 1] = edge[sz];
   edge[sz + 2] = edge[sz];

   // Don't overwrite first pixel.
   uint8_t *dst = p + 1;
   int remaining = sz - 1;

   if (strength == 1) {  // Filter: {4, 8, 4}.
     const uint8_t *src = edge + 1;

     for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
       vst1_u8(dst, filter_edge_kernel_484(src));
     }
     if (remaining > 0) {
       filter_edge_store_partial(dst, filter_edge_kernel_484(src), remaining);
     }
   } else if (strength == 2) {  // Filter: {5, 6, 5}.
     const uint8_t *src = edge + 1;

     for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
       vst1_u8(dst, filter_edge_kernel_565(src));
     }
     if (remaining > 0) {
       filter_edge_store_partial(dst, filter_edge_kernel_565(src), remaining);
     }
   } else {  // Filter {2, 4, 4, 4, 2}.
     // Starts one sample earlier because the kernel reaches two to each side.
     const uint8_t *src = edge;

     for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
       vst1_u8(dst, filter_edge_kernel_24442(src));
     }
     if (remaining > 0) {
       filter_edge_store_partial(dst, filter_edge_kernel_24442(src),
                                 remaining);
     }
   }
 }

 // Doubles the resolution of the edge p[-1..sz-1] in place.
 //
 // On return p[-2] holds the old p[-1], and starting at p[-1] the buffer holds
 // interleaved pairs {interpolated sample, original sample}. Each interpolated
 // sample is (-a + 9*b + 9*c - d) / 16, rounded and clamped to 8 bits, taken
 // over four consecutive samples of the padded edge
 //   { p[-1], p[-1], p[0], ..., p[sz-1], p[sz-1] }.
 //
 // Work is done 8 input samples at a time and every step stores a full 16
 // bytes, so the buffer behind p must have room for a whole number of 16-byte
 // groups starting at p[-1]. Lanes past the end of the padded edge are
 // computed from whatever the local scratch array happens to contain.
 void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
   if (!sz) return;

   assert(sz <= MAX_UPSAMPLE_SZ);

   // Build the padded copy before p is overwritten.
   uint8_t padded[MAX_UPSAMPLE_SZ + 3];
   padded[0] = p[-1];
   padded[1] = p[-1];
   memcpy(padded + 2, p, sz);
   padded[sz + 2] = p[sz - 1];
   p[-2] = p[-1];

   const uint8_t *in = padded;
   uint8_t *out = p - 1;
   int remaining = sz;

   do {
     const uint8x8_t a = vld1_u8(in);
     const uint8x8_t b = vld1_u8(in + 1);
     const uint8x8_t c = vld1_u8(in + 2);
     const uint8x8_t d = vld1_u8(in + 3);

     // 9 * (b + c) - (a + d), held as signed 16-bit so it may go negative.
     const int16x8_t outer = vreinterpretq_s16_u16(vaddl_u8(a, d));
     const int16x8_t inner = vreinterpretq_s16_u16(vaddl_u8(b, c));
     const int16x8_t filtered = vsubq_s16(vmulq_n_s16(inner, 9), outer);

     // vst2 interleaves: interpolated sample first, then the original one.
     uint8x8x2_t interleaved = { { vqrshrun_n_s16(filtered, 4), c } };
     vst2_u8(out, interleaved);

     in += 8;
     out += 16;
     remaining -= 8;
   } while (remaining > 0);
 }
	/*
	* Copyright (c) 2020, Alliance for Open Media. All rights reserved.
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include <arm_neon.h>
	#include <assert.h>

	#include "config/aom_config.h"
	#include "config/av1_rtcd.h"

	#include "aom/aom_integer.h"
	#include "aom_dsp/arm/mem_neon.h"
	#include "aom_dsp/arm/sum_neon.h"

	#define MAX_UPSAMPLE_SZ 16
	#define FILTER_INTRA_SCALE_BITS 4

	// Filter-intra tap table, indexed as [mode][tap row][lane].
	//
	// The kernels are the transpose of the ones defined in reconintra.c; the
	// top row of every mode holds the magnitudes of negative coefficients and
	// is therefore subtracted, not added, by filter_intra_predictor().
	DECLARE_ALIGNED(16, static const uint8_t,
	                av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = {
	  // clang-format off
	  {  // mode index 0
	      {  6,  5,  3,  3,  4,  3,  3,  3 },
	      { 10,  2,  1,  1,  6,  2,  2,  1 },
	      {  0, 10,  1,  1,  0,  6,  2,  2 },
	      {  0,  0, 10,  2,  0,  0,  6,  2 },
	      {  0,  0,  0, 10,  0,  0,  0,  6 },
	      { 12,  9,  7,  5,  2,  2,  2,  3 },
	      {  0,  0,  0,  0, 12,  9,  7,  5 }
	  },
	  {  // mode index 1
	      { 10,  6,  4,  2, 10,  6,  4,  2 },
	      { 16,  0,  0,  0, 16,  0,  0,  0 },
	      {  0, 16,  0,  0,  0, 16,  0,  0 },
	      {  0,  0, 16,  0,  0,  0, 16,  0 },
	      {  0,  0,  0, 16,  0,  0,  0, 16 },
	      { 10,  6,  4,  2,  0,  0,  0,  0 },
	      {  0,  0,  0,  0, 10,  6,  4,  2 }
	  },
	  {  // mode index 2
	      {  8,  8,  8,  8,  4,  4,  4,  4 },
	      {  8,  0,  0,  0,  4,  0,  0,  0 },
	      {  0,  8,  0,  0,  0,  4,  0,  0 },
	      {  0,  0,  8,  0,  0,  0,  4,  0 },
	      {  0,  0,  0,  8,  0,  0,  0,  4 },
	      { 16, 16, 16, 16,  0,  0,  0,  0 },
	      {  0,  0,  0,  0, 16, 16, 16, 16 }
	  },
	  {  // mode index 3
	      {  2,  1,  1,  0,  1,  1,  1,  1 },
	      {  8,  3,  2,  1,  4,  3,  2,  2 },
	      {  0,  8,  3,  2,  0,  4,  3,  2 },
	      {  0,  0,  8,  3,  0,  0,  4,  3 },
	      {  0,  0,  0,  8,  0,  0,  0,  4 },
	      { 10,  6,  4,  2,  3,  4,  4,  3 },
	      {  0,  0,  0,  0, 10,  6,  4,  3 }
	  },
	  {  // mode index 4
	      { 12, 10,  9,  8, 10,  9,  8,  7 },
	      { 14,  0,  0,  0, 12,  1,  0,  0 },
	      {  0, 14,  0,  0,  0, 12,  0,  0 },
	      {  0,  0, 14,  0,  0,  0, 12,  1 },
	      {  0,  0,  0, 14,  0,  0,  0, 12 },
	      { 14, 12, 11, 10,  0,  0,  1,  1 },
	      {  0,  0,  0,  0, 14, 12, 11,  9 }
	  }
	  // clang-format on
	};

	// Evaluates one 4x2 filter-intra patch (8 output lanes).
	//
	// s0..s6 are the seven neighbour pixels, each broadcast across all lanes,
	// and f0..f6 are the matching per-lane tap vectors. The result per lane is
	//   (s1*f1 + s2*f2 + s3*f3 + s4*f4 + s5*f5 + s6*f6 - s0*f0)
	// rounded and shifted right by FILTER_INTRA_SCALE_BITS, then saturated to
	// the unsigned 8-bit range.
	static inline uint8x8_t filter_intra_predictor(
	    uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4,
	    uint8x8_t s5, uint8x8_t s6, const uint8x8_t f0, const uint8x8_t f1,
	    const uint8x8_t f2, const uint8x8_t f3, const uint8x8_t f4,
	    const uint8x8_t f5, const uint8x8_t f6) {
	  // Gather the additive terms in a 16-bit accumulator.
	  uint16x8_t sum = vmull_u8(s1, f1);
	  sum = vmlal_u8(sum, s2, f2);
	  sum = vmlal_u8(sum, s3, f3);
	  sum = vmlal_u8(sum, s4, f4);
	  sum = vmlal_u8(sum, s5, f5);
	  sum = vmlal_u8(sum, s6, f6);

	  // The f0 taps are stored as magnitudes of negative coefficients, so
	  // their contribution is removed rather than added. The accumulator is
	  // modular, so doing this last gives the same bits as doing it first.
	  sum = vmlsl_u8(sum, s0, f0);

	  // Reinterpret as signed so a negative total saturates to 0 on narrowing.
	  const int16x8_t signed_sum = vreinterpretq_s16_u16(sum);
	  return vqrshrun_n_s16(signed_sum, FILTER_INTRA_SCALE_BITS);
	}

	// Predicts one pair of output rows, one 4x2 patch per step.
	//
	// out          - first of the two rows to write.
	// top          - row directly above `out` (at least `width` pixels).
	// top_is_above - non-zero when `top` is the caller's `above` array, which
	//                is read with load_unaligned_u8_4x1(); otherwise `top` is
	//                a previously written row of the block, read with
	//                load_u8_4x1().
	// corner       - pixel up and to the left of out[0].
	// left         - left[0] / left[1] are the pixels left of the two rows.
	// taps         - the seven tap vectors f0..f6 for the current mode.
	static inline void filter_intra_row_pair_w4(uint8_t *out, ptrdiff_t stride,
	                                            int width, const uint8_t *top,
	                                            int top_is_above,
	                                            uint8_t corner,
	                                            const uint8_t *left,
	                                            const uint8x8_t *taps) {
	  uint8x8_t top_left = vdup_n_u8(corner);
	  uint8x8_t left_upper = vdup_n_u8(left[0]);
	  uint8x8_t left_lower = vdup_n_u8(left[1]);

	  int col = 0;
	  do {
	    const uint8x8_t top4 = top_is_above ? load_unaligned_u8_4x1(top + col)
	                                        : load_u8_4x1(top + col);
	    const uint8x8_t top_last = vdup_lane_u8(top4, 3);

	    const uint8x8_t patch = filter_intra_predictor(
	        top_left, vdup_lane_u8(top4, 0), vdup_lane_u8(top4, 1),
	        vdup_lane_u8(top4, 2), top_last, left_upper, left_lower, taps[0],
	        taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);

	    // NOTE(review): presumably writes lanes 0-3 at `out + col` and lanes
	    // 4-7 one stride below - confirm against mem_neon.h.
	    store_u8x4_strided_x2(out + col, stride, patch);

	    // The patch just produced is the left neighbour of the next one: its
	    // last column (lanes 3 and 7) and the last pixel above it carry over.
	    top_left = top_last;
	    left_upper = vdup_lane_u8(patch, 3);
	    left_lower = vdup_lane_u8(patch, 7);

	    col += 4;
	  } while (col < width);
	}

	// Predicts one pair of output rows, two adjacent 4x2 patches (8 columns)
	// per step. Parameters are as for filter_intra_row_pair_w4(); the row
	// above is always read with vld1_u8().
	static inline void filter_intra_row_pair_w8(uint8_t *out, ptrdiff_t stride,
	                                            int width, const uint8_t *top,
	                                            uint8_t corner,
	                                            const uint8_t *left,
	                                            const uint8x8_t *taps) {
	  uint8x8_t top_left = vdup_n_u8(corner);
	  uint8x8_t left_upper = vdup_n_u8(left[0]);
	  uint8x8_t left_lower = vdup_n_u8(left[1]);

	  int col = 0;
	  do {
	    const uint8x8_t top8 = vld1_u8(top + col);
	    const uint8x8_t top3 = vdup_lane_u8(top8, 3);
	    const uint8x8_t top7 = vdup_lane_u8(top8, 7);

	    // Left 4x2 patch.
	    const uint8x8_t patch_lo = filter_intra_predictor(
	        top_left, vdup_lane_u8(top8, 0), vdup_lane_u8(top8, 1),
	        vdup_lane_u8(top8, 2), top3, left_upper, left_lower, taps[0],
	        taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);

	    // Right 4x2 patch; its left neighbours are the last column of
	    // patch_lo.
	    const uint8x8_t patch_hi = filter_intra_predictor(
	        top3, vdup_lane_u8(top8, 4), vdup_lane_u8(top8, 5),
	        vdup_lane_u8(top8, 6), top7, vdup_lane_u8(patch_lo, 3),
	        vdup_lane_u8(patch_lo, 7), taps[0], taps[1], taps[2], taps[3],
	        taps[4], taps[5], taps[6]);

	    // Each patch holds {upper row, lower row}; zipping the 32-bit halves
	    // yields two 8-pixel rows.
	    const uint32x2x2_t rows = vzip_u32(vreinterpret_u32_u8(patch_lo),
	                                       vreinterpret_u32_u8(patch_hi));
	    vst1_u8(out + col, vreinterpret_u8_u32(rows.val[0]));
	    vst1_u8(out + stride + col, vreinterpret_u8_u32(rows.val[1]));

	    top_left = top7;
	    left_upper = vdup_lane_u8(patch_hi, 3);
	    left_lower = vdup_lane_u8(patch_hi, 7);

	    col += 8;
	  } while (col < width);
	}

	// Filter-intra prediction of a tx_size block into dst.
	//
	// The block is produced two rows at a time. The first row pair is
	// predicted from above[-1], above[0..width-1] and left[0..1]; every
	// following pair is predicted from the row of dst written just before it
	// together with left[r-1..r+1]. `mode` selects the tap set in
	// av1_filter_intra_taps_neon.
	void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
	                                     TX_SIZE tx_size, const uint8_t *above,
	                                     const uint8_t *left, int mode) {
	  const int width = tx_size_wide[tx_size];
	  const int height = tx_size_high[tx_size];
	  assert(width <= 32 && height <= 32);

	  uint8x8_t taps[7];
	  for (int i = 0; i < 7; ++i) {
	    taps[i] = vld1_u8(av1_filter_intra_taps_neon[mode][i]);
	  }

	  // Computing 4 cols per iteration (instead of 8) for 8x<h> blocks is
	  // faster.
	  if (width <= 8) {
	    filter_intra_row_pair_w4(dst, stride, width, above, 1, above[-1], left,
	                             taps);

	    int row = 2;
	    do {
	      filter_intra_row_pair_w4(dst + row * stride, stride, width,
	                               dst + (row - 1) * stride, 0, left[row - 1],
	                               left + row, taps);
	      row += 2;
	    } while (row < height);
	  } else {
	    filter_intra_row_pair_w8(dst, stride, width, above, above[-1], left,
	                             taps);

	    int row = 2;
	    do {
	      filter_intra_row_pair_w8(dst + row * stride, stride, width,
	                               dst + (row - 1) * stride, left[row - 1],
	                               left + row, taps);
	      row += 2;
	    } while (row < height);
	  }
	}

	// Writes lanes 0..count-1 of `v` to dst[0..count-1] and leaves every other
	// byte of dst untouched (and unread). `count` must be in 1..8.
	static inline void filter_edge_store_partial(uint8_t *dst, uint8x8_t v,
	                                             int count) {
	  uint8_t lanes[8];
	  vst1_u8(lanes, v);
	  memcpy(dst, lanes, (size_t)count);
	}

	// 3-tap {4, 8, 4} / 16 kernel over src[0..9]; lane i is centred on
	// src[i + 1].
	static inline uint8x8_t filter_edge_kernel_484(const uint8_t *src) {
	  const uint8x8_t a = vld1_u8(src);
	  const uint8x8_t b = vld1_u8(src + 1);
	  const uint8x8_t c = vld1_u8(src + 2);

	  // Make use of the identity:
	  // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
	  const uint16x8_t outer = vaddl_u8(a, c);
	  const uint16x8_t centre = vaddl_u8(b, b);
	  return vrshrn_n_u16(vaddq_u16(outer, centre), 2);
	}

	// 3-tap {5, 6, 5} / 16 kernel over src[0..9]; lane i is centred on
	// src[i + 1].
	static inline uint8x8_t filter_edge_kernel_565(const uint8_t *src) {
	  uint16x8_t acc = vmull_u8(vld1_u8(src), vdup_n_u8(5));
	  acc = vmlal_u8(acc, vld1_u8(src + 1), vdup_n_u8(6));
	  acc = vmlal_u8(acc, vld1_u8(src + 2), vdup_n_u8(5));
	  return vrshrn_n_u16(acc, 4);
	}

	// 5-tap {2, 4, 4, 4, 2} / 16 kernel over src[0..11]; lane i is centred on
	// src[i + 2].
	static inline uint8x8_t filter_edge_kernel_24442(const uint8_t *src) {
	  const uint8x8_t a = vld1_u8(src);
	  const uint8x8_t b = vld1_u8(src + 1);
	  const uint8x8_t c = vld1_u8(src + 2);
	  const uint8x8_t d = vld1_u8(src + 3);
	  const uint8x8_t e = vld1_u8(src + 4);

	  // Make use of the identity:
	  // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
	  const uint16x8_t outer = vaddl_u8(a, e);
	  uint16x8_t inner = vaddw_u8(vaddl_u8(b, c), d);
	  inner = vaddq_u16(inner, inner);
	  return vrshrn_n_u16(vaddq_u16(outer, inner), 3);
	}

	// Smooths the edge p[0..sz-1] in place with a low-pass filter.
	//
	// p        - edge samples; p[0] is never modified, p[1..sz-1] are replaced
	//            by the filtered values. Samples outside the edge are taken to
	//            be equal to the nearest end sample.
	// sz       - number of samples, 0..129.
	// strength - 0: no-op; 1: {4, 8, 4}; 2: {5, 6, 5}; any other value:
	//            {2, 4, 4, 4, 2}. All kernels are normalised by 16 with
	//            rounding.
	//
	// Only bytes p[0..sz-1] are accessed: the final partial vector is written
	// via filter_edge_store_partial() rather than by a full 8-byte
	// load/blend/store, so the caller's buffer needs no padding beyond sz.
	void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
	  if (!strength) return;
	  assert(sz >= 0 && sz <= 129);

	  // p[0] is kept as is, so fewer than two samples leaves nothing to
	  // filter. Returning here also avoids touching an uninitialised edge[]
	  // for sz == 0.
	  if (sz < 2) return;

	  uint8_t edge[160];  // Max value of sz + enough padding for vector accesses.
	  memcpy(edge + 1, p, sz * sizeof(*p));

	  // Replicate the end samples: one extra on the left, two on the right.
	  edge[0] = edge[1];
	  edge[sz + 1] = edge[sz];
	  edge[sz + 2] = edge[sz];

	  // Don't overwrite first pixel.
	  uint8_t *dst = p + 1;
	  int remaining = sz - 1;

	  if (strength == 1) {  // Filter: {4, 8, 4}.
	    const uint8_t *src = edge + 1;

	    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
	      vst1_u8(dst, filter_edge_kernel_484(src));
	    }
	    if (remaining > 0) {
	      filter_edge_store_partial(dst, filter_edge_kernel_484(src),
	                                remaining);
	    }
	  } else if (strength == 2) {  // Filter: {5, 6, 5}.
	    const uint8_t *src = edge + 1;

	    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
	      vst1_u8(dst, filter_edge_kernel_565(src));
	    }
	    if (remaining > 0) {
	      filter_edge_store_partial(dst, filter_edge_kernel_565(src),
	                                remaining);
	    }
	  } else {  // Filter {2, 4, 4, 4, 2}.
	    // Starts one sample earlier because the kernel reaches two to each
	    // side.
	    const uint8_t *src = edge;

	    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
	      vst1_u8(dst, filter_edge_kernel_24442(src));
	    }
	    if (remaining > 0) {
	      filter_edge_store_partial(dst, filter_edge_kernel_24442(src),
	                                remaining);
	    }
	  }
	}

	// Doubles the resolution of the edge p[-1..sz-1] in place.
	//
	// On return p[-2] holds the old p[-1], and starting at p[-1] the buffer
	// holds interleaved pairs {interpolated sample, original sample}. Each
	// interpolated sample is (-a + 9*b + 9*c - d) / 16, rounded and clamped to
	// 8 bits, taken over four consecutive samples of the padded edge
	//   { p[-1], p[-1], p[0], ..., p[sz-1], p[sz-1] }.
	//
	// Work is done 8 input samples at a time and every step stores a full 16
	// bytes, so the buffer behind p must have room for a whole number of
	// 16-byte groups starting at p[-1]. Lanes past the end of the padded edge
	// are computed from whatever the local scratch array happens to contain.
	void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
	  if (!sz) return;

	  assert(sz <= MAX_UPSAMPLE_SZ);

	  // Build the padded copy before p is overwritten.
	  uint8_t padded[MAX_UPSAMPLE_SZ + 3];
	  padded[0] = p[-1];
	  padded[1] = p[-1];
	  memcpy(padded + 2, p, sz);
	  padded[sz + 2] = p[sz - 1];
	  p[-2] = p[-1];

	  const uint8_t *in = padded;
	  uint8_t *out = p - 1;
	  int remaining = sz;

	  do {
	    const uint8x8_t a = vld1_u8(in);
	    const uint8x8_t b = vld1_u8(in + 1);
	    const uint8x8_t c = vld1_u8(in + 2);
	    const uint8x8_t d = vld1_u8(in + 3);

	    // 9 * (b + c) - (a + d), held as signed 16-bit so it may go negative.
	    const int16x8_t outer = vreinterpretq_s16_u16(vaddl_u8(a, d));
	    const int16x8_t inner = vreinterpretq_s16_u16(vaddl_u8(b, c));
	    const int16x8_t filtered = vsubq_s16(vmulq_n_s16(inner, 9), outer);

	    // vst2 interleaves: interpolated sample first, then the original one.
	    uint8x8x2_t interleaved = { { vqrshrun_n_s16(filtered, 4), c } };
	    vst2_u8(out, interleaved);

	    in += 8;
	    out += 16;
	    remaining -= 8;
	  } while (remaining > 0);
	}