av1/common/clpf_simd.h - avm - Git at Google

 /*
  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include "./aom_dsp_rtcd.h"

 static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
                        int dstride, int x0, int y0, int sizey, int width,
                        int height, unsigned int strength) {
   dst += x0 + y0 * dstride;
   src += x0 + y0 * sstride;
   {
     int bottom = height - 2 - y0;
     const v128 sp = v128_dup_8(strength);
     const v128 sm = v128_dup_8(-(int)strength);
     const v128 c8 = v128_dup_8(8);
     const v128 c128 = v128_dup_8(128);

     if (!x0) {  // Clip left
       const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
                                          v64_from_64(0x0504030201000000LL));
       const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
                                          v64_from_64(0x0605040302010000LL));
       int y;

       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
         const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
             v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_shuffle_8(x, b_shuff);
         const v128 c = v128_shuffle_8(x, c_shuff);
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
                                         src + ((y != bottom) + 1) * sstride)));

         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
                        v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
         const v128 delta = v128_add_8(
             v128_add_8(
                 v128_shl_8(
                     v128_add_8(
                         v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
                     2),
                 v128_add_8(
                     v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
             v128_add_8(v128_add_8(tmp, tmp), tmp));
         o = v128_add_8(
             o, v128_shr_s8(
                    v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
         v64_store_aligned(dst + dstride, v128_low_v64(o));
         src += sstride * 2;
         dst += dstride * 2;
       }
     } else if (!(width - x0 - 8)) {  // Clip right
       const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
                                          v64_from_64(0x0707060504030201LL));
       const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
                                          v64_from_64(0x0707070605040302LL));
       int y;

       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
         const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
             v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_shuffle_8(x, d_shuff);
         const v128 e = v128_shuffle_8(x, e_shuff);
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
                                         src + ((y != bottom) + 1) * sstride)));

         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
                        v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
         const v128 delta = v128_add_8(
             v128_add_8(
                 v128_shl_8(
                     v128_add_8(
                         v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
                     2),
                 v128_add_8(
                     v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
             v128_add_8(v128_add_8(tmp, tmp), tmp));
         o = v128_add_8(
             o, v128_shr_s8(
                    v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
         v64_store_aligned(dst + dstride, v128_low_v64(o));
         src += sstride * 2;
         dst += dstride * 2;
       }
     } else {  // No left/right clipping
       int y;
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
         const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
             v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
                                         src + ((y != bottom) + 1) * sstride)));

         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
                        v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
         const v128 delta = v128_add_8(
             v128_add_8(
                 v128_shl_8(
                     v128_add_8(
                         v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
                     2),
                 v128_add_8(
                     v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
             v128_add_8(v128_add_8(tmp, tmp), tmp));
         o = v128_add_8(
             o, v128_shr_s8(
                    v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
         v64_store_aligned(dst + dstride, v128_low_v64(o));
         src += sstride * 2;
         dst += dstride * 2;
       }
     }
   }
 }

 void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
   // TODO(stemidts):
   // A sizex different from 8 will only be needed if CLPF is extended to chroma.
   // This will only be used if 4:2:0 and width not a multiple of 16 and along
   // the right edge only, so we can fall back to the plain C implementation in
   // this case.  If not extended to chroma, this test will be redundant.
   if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
     // Fallback to C for odd sizes
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
     clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
                strength);
   }
 }
	/*
	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include "./aom_dsp_rtcd.h"

	static void clpf_block(const uint8_t src, uint8_t dst, int sstride,
	int dstride, int x0, int y0, int sizey, int width,
	int height, unsigned int strength) {
	dst += x0 + y0 * dstride;
	src += x0 + y0 * sstride;
	{
	int bottom = height - 2 - y0;
	const v128 sp = v128_dup_8(strength);
	const v128 sm = v128_dup_8(-(int)strength);
	const v128 c8 = v128_dup_8(8);
	const v128 c128 = v128_dup_8(128);

	if (!x0) { // Clip left
	const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
	v64_from_64(0x0504030201000000LL));
	const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
	v64_from_64(0x0605040302010000LL));
	int y;

	for (y = 0; y < sizey; y += 2) {
	const v64 l1 = v64_load_aligned(src);
	const v64 l2 = v64_load_aligned(src + sstride);
	v128 o = v128_from_v64(l1, l2);
	const v128 x = v128_add_8(c128, o);
	const v128 a = v128_add_8(
	c128,
	v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
	const v128 b = v128_shuffle_8(x, b_shuff);
	const v128 c = v128_shuffle_8(x, c_shuff);
	const v128 d = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src + 1),
	v64_load_unaligned(src + 1 + sstride)));
	const v128 e = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src + 2),
	v64_load_unaligned(src + 2 + sstride)));
	const v128 f = v128_add_8(
	c128, v128_from_v64(l2, v64_load_aligned(
	src + ((y != bottom) + 1) * sstride)));

	const v128 tmp =
	v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
	const v128 delta = v128_add_8(
	v128_add_8(
	v128_shl_8(
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
	2),
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
	v128_add_8(v128_add_8(tmp, tmp), tmp));
	o = v128_add_8(
	o, v128_shr_s8(
	v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
	delta, v128_zero()))),
	4));
	v64_store_aligned(dst, v128_high_v64(o));
	v64_store_aligned(dst + dstride, v128_low_v64(o));
	src += sstride * 2;
	dst += dstride * 2;
	}
	} else if (!(width - x0 - 8)) { // Clip right
	const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
	v64_from_64(0x0707060504030201LL));
	const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
	v64_from_64(0x0707070605040302LL));
	int y;

	for (y = 0; y < sizey; y += 2) {
	const v64 l1 = v64_load_aligned(src);
	const v64 l2 = v64_load_aligned(src + sstride);
	v128 o = v128_from_v64(l1, l2);
	const v128 x = v128_add_8(c128, o);
	const v128 a = v128_add_8(
	c128,
	v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
	const v128 b = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src - 2),
	v64_load_unaligned(src - 2 + sstride)));
	const v128 c = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src - 1),
	v64_load_unaligned(src - 1 + sstride)));
	const v128 d = v128_shuffle_8(x, d_shuff);
	const v128 e = v128_shuffle_8(x, e_shuff);
	const v128 f = v128_add_8(
	c128, v128_from_v64(l2, v64_load_aligned(
	src + ((y != bottom) + 1) * sstride)));

	const v128 tmp =
	v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
	const v128 delta = v128_add_8(
	v128_add_8(
	v128_shl_8(
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
	2),
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
	v128_add_8(v128_add_8(tmp, tmp), tmp));
	o = v128_add_8(
	o, v128_shr_s8(
	v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
	delta, v128_zero()))),
	4));
	v64_store_aligned(dst, v128_high_v64(o));
	v64_store_aligned(dst + dstride, v128_low_v64(o));
	src += sstride * 2;
	dst += dstride * 2;
	}
	} else { // No left/right clipping
	int y;
	for (y = 0; y < sizey; y += 2) {
	const v64 l1 = v64_load_aligned(src);
	const v64 l2 = v64_load_aligned(src + sstride);
	v128 o = v128_from_v64(l1, l2);
	const v128 x = v128_add_8(c128, o);
	const v128 a = v128_add_8(
	c128,
	v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
	const v128 b = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src - 2),
	v64_load_unaligned(src - 2 + sstride)));
	const v128 c = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src - 1),
	v64_load_unaligned(src - 1 + sstride)));
	const v128 d = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src + 1),
	v64_load_unaligned(src + 1 + sstride)));
	const v128 e = v128_add_8(
	c128, v128_from_v64(v64_load_unaligned(src + 2),
	v64_load_unaligned(src + 2 + sstride)));
	const v128 f = v128_add_8(
	c128, v128_from_v64(l2, v64_load_aligned(
	src + ((y != bottom) + 1) * sstride)));

	const v128 tmp =
	v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
	const v128 delta = v128_add_8(
	v128_add_8(
	v128_shl_8(
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
	2),
	v128_add_8(
	v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
	v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
	v128_add_8(v128_add_8(tmp, tmp), tmp));
	o = v128_add_8(
	o, v128_shr_s8(
	v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
	delta, v128_zero()))),
	4));
	v64_store_aligned(dst, v128_high_v64(o));
	v64_store_aligned(dst + dstride, v128_low_v64(o));
	src += sstride * 2;
	dst += dstride * 2;
	}
	}
	}
	}

	void SIMD_FUNC(aom_clpf_block)(const uint8_t src, uint8_t dst, int sstride,
	int dstride, int x0, int y0, int sizex,
	int sizey, int width, int height,
	unsigned int strength) {
	// TODO(stemidts):
	// A sizex different from 8 will only be needed if CLPF is extended to chroma.
	// This will only be used if 4:2:0 and width not a multiple of 16 and along
	// the right edge only, so we can fall back to the plain C implementation in
	// this case. If not extended to chroma, this test will be redundant.
	if (sizex != 8 \|\| width < 16 \|\| y0 + 8 > height \|\| x0 + 8 > width) {
	// Fallback to C for odd sizes
	aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
	height, strength);
	} else {
	clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
	strength);
	}
	}