| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include "aom_dsp/aom_simd.h" |
| |
| SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, |
| v128 f, v128 sp, v128 sm) { |
| const v128 tmp = |
| v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), |
| v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); |
| v128 delta = v128_add_8( |
| v128_add_8( |
| v128_shl_8( |
| v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), |
| v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), |
| 2), |
| v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), |
| v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), |
| v128_add_8(v128_add_8(tmp, tmp), tmp)); |
| |
| return v128_shr_s8( |
| v128_add_8(v128_dup_8(8), |
| v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), |
| 4); |
| } |
| |
| void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, |
| int rstride, int ostride, int x0, int y0, |
| int width, int height, int *sum0, int *sum1, |
| unsigned int strength) { |
| ssd128_internal ssd0 = v128_ssd_u8_init(); |
| ssd128_internal ssd1 = v128_ssd_u8_init(); |
| const v128 c128 = v128_dup_8(128); |
| const v128 sp = v128_dup_8(strength); |
| const v128 sm = v128_dup_8(-(int)strength); |
| const int bottom = height - 2 - y0; |
| |
| rec += x0 + y0 * rstride; |
| org += x0 + y0 * ostride; |
| |
| if (!x0) { // Clip left |
| const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), |
| v64_from_64(0x0504030201000000LL)); |
| const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), |
| v64_from_64(0x0605040302010000LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| const v128 b = v128_shuffle_8(x, b_shuff); |
| const v128 c = v128_shuffle_8(x, c_shuff); |
| const v128 d = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec + 1), |
| v64_load_unaligned(rec + 1 + rstride))); |
| const v128 e = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec + 2), |
| v64_load_unaligned(rec + 2 + rstride))); |
| const v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } else if (!(width - x0 - 8)) { // Clip right |
| const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), |
| v64_from_64(0x0707060504030201LL)); |
| const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), |
| v64_from_64(0x0707070605040302LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| const v128 b = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec - 2), |
| v64_load_unaligned(rec - 2 + rstride))); |
| const v128 c = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec - 1), |
| v64_load_unaligned(rec - 1 + rstride))); |
| const v128 d = v128_shuffle_8(x, d_shuff); |
| const v128 e = v128_shuffle_8(x, e_shuff); |
| const v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } else { // No left/right clipping |
| int y; |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| const v128 b = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec - 2), |
| v64_load_unaligned(rec - 2 + rstride))); |
| const v128 c = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec - 1), |
| v64_load_unaligned(rec - 1 + rstride))); |
| const v128 d = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec + 1), |
| v64_load_unaligned(rec + 1 + rstride))); |
| const v128 e = v128_add_8( |
| c128, v128_from_v64(v64_load_unaligned(rec + 2), |
| v64_load_unaligned(rec + 2 + rstride))); |
| const v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } |
| *sum0 += v128_ssd_u8_sum(ssd0); |
| *sum1 += v128_ssd_u8_sum(ssd1); |
| } |
| |
| SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b, |
| v128 c, v128 d, v128 e, v128 f, v128 cp1, |
| v128 cm1, v128 cp2, v128 cm2, v128 cp4, |
| v128 cm4, ssd128_internal *ssd1, |
| ssd128_internal *ssd2, |
| ssd128_internal *ssd3) { |
| v128 tmp, delta1, delta2, delta3; |
| const v128 c8 = v128_dup_8(8); |
| |
| a = v128_ssub_s8(a, x); |
| b = v128_ssub_s8(b, x); |
| c = v128_ssub_s8(c, x); |
| d = v128_ssub_s8(d, x); |
| e = v128_ssub_s8(e, x); |
| f = v128_ssub_s8(f, x); |
| tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), |
| v128_max_s8(v128_min_s8(d, cp1), cm1)); |
| delta1 = v128_add_8( |
| v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), |
| v128_max_s8(v128_min_s8(f, cp1), cm1)), |
| 2), |
| v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), |
| v128_max_s8(v128_min_s8(e, cp1), cm1))), |
| v128_add_8(v128_add_8(tmp, tmp), tmp)); |
| tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), |
| v128_max_s8(v128_min_s8(d, cp2), cm2)); |
| delta2 = v128_add_8( |
| v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), |
| v128_max_s8(v128_min_s8(f, cp2), cm2)), |
| 2), |
| v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), |
| v128_max_s8(v128_min_s8(e, cp2), cm2))), |
| v128_add_8(v128_add_8(tmp, tmp), tmp)); |
| tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), |
| v128_max_s8(v128_min_s8(d, cp4), cm4)); |
| delta3 = v128_add_8( |
| v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), |
| v128_max_s8(v128_min_s8(f, cp4), cm4)), |
| 2), |
| v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), |
| v128_max_s8(v128_min_s8(e, cp4), cm4))), |
| v128_add_8(v128_add_8(tmp, tmp), tmp)); |
| |
| *ssd1 = v128_ssd_u8( |
| *ssd1, o, |
| v128_add_8( |
| q, v128_shr_s8( |
| v128_add_8(c8, v128_add_8(delta1, |
| v128_cmplt_s8(delta1, v128_zero()))), |
| 4))); |
| *ssd2 = v128_ssd_u8( |
| *ssd2, o, |
| v128_add_8( |
| q, v128_shr_s8( |
| v128_add_8(c8, v128_add_8(delta2, |
| v128_cmplt_s8(delta2, v128_zero()))), |
| 4))); |
| *ssd3 = v128_ssd_u8( |
| *ssd3, o, |
| v128_add_8( |
| q, v128_shr_s8( |
| v128_add_8(c8, v128_add_8(delta3, |
| v128_cmplt_s8(delta3, v128_zero()))), |
| 4))); |
| } |
| |
| // Test multiple filter strengths at once. |
| void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, |
| int rstride, int ostride, int x0, int y0, |
| int width, int height, int *sum) { |
| const v128 c128 = v128_dup_8(128); |
| const v128 cp1 = v128_dup_8(1); |
| const v128 cm1 = v128_dup_8(-1); |
| const v128 cp2 = v128_dup_8(2); |
| const v128 cm2 = v128_dup_8(-2); |
| const v128 cp4 = v128_dup_8(4); |
| const v128 cm4 = v128_dup_8(-4); |
| const int bottom = height - 2 - y0; |
| ssd128_internal ssd0 = v128_ssd_u8_init(); |
| ssd128_internal ssd1 = v128_ssd_u8_init(); |
| ssd128_internal ssd2 = v128_ssd_u8_init(); |
| ssd128_internal ssd3 = v128_ssd_u8_init(); |
| |
| rec += x0 + y0 * rstride; |
| org += x0 + y0 * ostride; |
| |
| if (!x0) { // Clip left |
| const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), |
| v64_from_64(0x0504030201000000LL)); |
| const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), |
| v64_from_64(0x0605040302010000LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| v128 b = v128_shuffle_8(x, b_shuff); |
| v128 c = v128_shuffle_8(x, c_shuff); |
| v128 d = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec + 1), |
| v64_load_unaligned(rec + 1 + rstride))); |
| v128 e = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec + 2), |
| v64_load_unaligned(rec + 2 + rstride))); |
| v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } else if (!(width - x0 - 8)) { // Clip right |
| const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), |
| v64_from_64(0x0707060504030201LL)); |
| const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), |
| v64_from_64(0x0707070605040302LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| v128 b = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec - 2), |
| v64_load_unaligned(rec - 2 + rstride))); |
| v128 c = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec - 1), |
| v64_load_unaligned(rec - 1 + rstride))); |
| v128 d = v128_shuffle_8(x, d_shuff); |
| v128 e = v128_shuffle_8(x, e_shuff); |
| v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } else { // No left/right clipping |
| int y; |
| for (y = 0; y < 8; y += 2) { |
| const v64 k1 = v64_load_aligned(org); |
| const v64 k2 = v64_load_aligned(org + ostride); |
| const v64 l1 = v64_load_aligned(rec); |
| const v64 l2 = v64_load_aligned(rec + rstride); |
| v128 o = v128_from_v64(k1, k2); |
| const v128 q = v128_from_v64(l1, l2); |
| const v128 x = v128_add_8(c128, q); |
| v128 a = v128_add_8( |
| c128, |
| v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); |
| v128 b = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec - 2), |
| v64_load_unaligned(rec - 2 + rstride))); |
| v128 c = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec - 1), |
| v64_load_unaligned(rec - 1 + rstride))); |
| v128 d = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec + 1), |
| v64_load_unaligned(rec + 1 + rstride))); |
| v128 e = v128_add_8(c128, |
| v128_from_v64(v64_load_unaligned(rec + 2), |
| v64_load_unaligned(rec + 2 + rstride))); |
| v128 f = v128_add_8( |
| c128, v128_from_v64( |
| l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } |
| sum[0] += v128_ssd_u8_sum(ssd0); |
| sum[1] += v128_ssd_u8_sum(ssd1); |
| sum[2] += v128_ssd_u8_sum(ssd2); |
| sum[3] += v128_ssd_u8_sum(ssd3); |
| } |
| |
| #if CONFIG_AOM_HIGHBITDEPTH |
| void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org, |
| int rstride, int ostride, int x0, int y0, |
| int width, int height, int *sum0, int *sum1, |
| unsigned int strength, int shift) { |
| ssd128_internal ssd0 = v128_ssd_u8_init(); |
| ssd128_internal ssd1 = v128_ssd_u8_init(); |
| const v128 c128 = v128_dup_8(128); |
| const v128 sp = v128_dup_8(strength >> shift); |
| const v128 sm = v128_dup_8(-(int)(strength >> shift)); |
| const int bottom = height - 2 - y0; |
| |
| rec += x0 + y0 * rstride; |
| org += x0 + y0 * ostride; |
| |
| if (!x0) { // Clip left |
| const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), |
| v64_from_64(0x0504030201000000LL)); |
| const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), |
| v64_from_64(0x0605040302010000LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_shuffle_8(x, b_shuff); |
| const v128 c = v128_shuffle_8(x, c_shuff); |
| const v128 d = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); |
| const v128 e = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } else if (!(width - x0 - 8)) { // Clip right |
| const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), |
| v64_from_64(0x0707060504030201LL)); |
| const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), |
| v64_from_64(0x0707070605040302LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); |
| const v128 c = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); |
| const v128 d = v128_shuffle_8(x, d_shuff); |
| const v128 e = v128_shuffle_8(x, e_shuff); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } else { // No left/right clipping |
| int y; |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); |
| const v128 c = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); |
| const v128 d = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); |
| const v128 e = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| ssd1 = v128_ssd_u8( |
| ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); |
| rec += rstride * 2; |
| org += ostride * 2; |
| } |
| } |
| *sum0 += v128_ssd_u8_sum(ssd0); |
| *sum1 += v128_ssd_u8_sum(ssd1); |
| } |
| |
| void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec, |
| const uint16_t *org, int rstride, |
| int ostride, int x0, int y0, |
| int width, int height, int *sum, |
| int shift) { |
| const v128 c128 = v128_dup_8(128); |
| const v128 cp1 = v128_dup_8(1); |
| const v128 cm1 = v128_dup_8(-1); |
| const v128 cp2 = v128_dup_8(2); |
| const v128 cm2 = v128_dup_8(-2); |
| const v128 cp4 = v128_dup_8(4); |
| const v128 cm4 = v128_dup_8(-4); |
| const int bottom = height - 2 - y0; |
| ssd128_internal ssd0 = v128_ssd_u8_init(); |
| ssd128_internal ssd1 = v128_ssd_u8_init(); |
| ssd128_internal ssd2 = v128_ssd_u8_init(); |
| ssd128_internal ssd3 = v128_ssd_u8_init(); |
| |
| rec += x0 + y0 * rstride; |
| org += x0 + y0 * ostride; |
| |
| if (!x0) { // Clip left |
| const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), |
| v64_from_64(0x0504030201000000LL)); |
| const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), |
| v64_from_64(0x0605040302010000LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_shuffle_8(x, b_shuff); |
| const v128 c = v128_shuffle_8(x, c_shuff); |
| const v128 d = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); |
| const v128 e = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } else if (!(width - x0 - 8)) { // Clip right |
| const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), |
| v64_from_64(0x0707060504030201LL)); |
| const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), |
| v64_from_64(0x0707070605040302LL)); |
| int y; |
| |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); |
| const v128 c = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); |
| const v128 d = v128_shuffle_8(x, d_shuff); |
| const v128 e = v128_shuffle_8(x, e_shuff); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } else { // No left/right clipping |
| int y; |
| for (y = 0; y < 8; y += 2) { |
| const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| const v128 o = |
| v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), |
| v128_shr_u16(v128_load_aligned(org + ostride), shift)); |
| const v128 q = v128_unziplo_8(n1, n2); |
| const v128 x = v128_add_8(c128, q); |
| const v128 a = v128_add_8( |
| c128, v128_unziplo_8( |
| v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), |
| shift), |
| n1)); |
| const v128 b = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); |
| const v128 c = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec - 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); |
| const v128 d = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 1), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); |
| const v128 e = v128_add_8( |
| c128, |
| v128_unziplo_8( |
| v128_shr_u16(v128_load_unaligned(rec + 2), shift), |
| v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); |
| const v128 f = v128_add_8( |
| c128, v128_unziplo_8( |
| n2, v128_shr_u16(v128_load_unaligned( |
| rec + ((y != bottom) + 1) * rstride), |
| shift))); |
| |
| ssd0 = v128_ssd_u8(ssd0, o, q); |
| calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, |
| &ssd1, &ssd2, &ssd3); |
| rec += 2 * rstride; |
| org += 2 * ostride; |
| } |
| } |
| sum[0] += v128_ssd_u8_sum(ssd0); |
| sum[1] += v128_ssd_u8_sum(ssd1); |
| sum[2] += v128_ssd_u8_sum(ssd2); |
| sum[3] += v128_ssd_u8_sum(ssd3); |
| } |
| #endif |