/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
|  |  | 
|  | #include "libyuv/basic_types.h" | 
|  |  | 
|  | #include "libyuv/compare_row.h" | 
|  | #include "libyuv/row.h" | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | namespace libyuv { | 
|  | extern "C" { | 
|  | #endif | 
|  |  | 
|  | #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ | 
|  | !defined(__aarch64__) | 
|  |  | 
|  | // 256 bits at a time | 
|  | // uses short accumulator which restricts count to 131 KB | 
|  | uint32_t HammingDistance_NEON(const uint8_t* src_a, | 
|  | const uint8_t* src_b, | 
|  | int count) { | 
|  | uint32_t diff; | 
|  |  | 
|  | asm volatile( | 
|  | "vmov.u16    q4, #0                        \n"  // accumulator | 
|  |  | 
|  | "1:                                        \n" | 
|  | "vld1.8      {q0, q1}, [%0]!               \n" | 
|  | "vld1.8      {q2, q3}, [%1]!               \n" | 
|  | "veor.32     q0, q0, q2                    \n" | 
|  | "veor.32     q1, q1, q3                    \n" | 
|  | "vcnt.i8     q0, q0                        \n" | 
|  | "vcnt.i8     q1, q1                        \n" | 
|  | "subs        %2, %2, #32                   \n" | 
|  | "vadd.u8     q0, q0, q1                    \n"  // 16 byte counts | 
|  | "vpadal.u8   q4, q0                        \n"  // 8 shorts | 
|  | "bgt         1b                            \n" | 
|  |  | 
|  | "vpaddl.u16  q0, q4                        \n"  // 4 ints | 
|  | "vpadd.u32   d0, d0, d1                    \n" | 
|  | "vpadd.u32   d0, d0, d0                    \n" | 
|  | "vmov.32     %3, d0[0]                     \n" | 
|  |  | 
|  | : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) | 
|  | : | 
|  | : "cc", "q0", "q1", "q2", "q3", "q4"); | 
|  | return diff; | 
|  | } | 
|  |  | 
|  | uint32_t SumSquareError_NEON(const uint8_t* src_a, | 
|  | const uint8_t* src_b, | 
|  | int count) { | 
|  | uint32_t sse; | 
|  | asm volatile( | 
|  | "vmov.u8     q8, #0                        \n" | 
|  | "vmov.u8     q10, #0                       \n" | 
|  | "vmov.u8     q9, #0                        \n" | 
|  | "vmov.u8     q11, #0                       \n" | 
|  |  | 
|  | "1:                                        \n" | 
|  | "vld1.8      {q0}, [%0]!                   \n" | 
|  | "vld1.8      {q1}, [%1]!                   \n" | 
|  | "subs        %2, %2, #16                   \n" | 
|  | "vsubl.u8    q2, d0, d2                    \n" | 
|  | "vsubl.u8    q3, d1, d3                    \n" | 
|  | "vmlal.s16   q8, d4, d4                    \n" | 
|  | "vmlal.s16   q9, d6, d6                    \n" | 
|  | "vmlal.s16   q10, d5, d5                   \n" | 
|  | "vmlal.s16   q11, d7, d7                   \n" | 
|  | "bgt         1b                            \n" | 
|  |  | 
|  | "vadd.u32    q8, q8, q9                    \n" | 
|  | "vadd.u32    q10, q10, q11                 \n" | 
|  | "vadd.u32    q11, q8, q10                  \n" | 
|  | "vpaddl.u32  q1, q11                       \n" | 
|  | "vadd.u64    d0, d2, d3                    \n" | 
|  | "vmov.32     %3, d0[0]                     \n" | 
|  | : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) | 
|  | : | 
|  | : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); | 
|  | return sse; | 
|  | } | 
|  |  | 
|  | #endif  // defined(__ARM_NEON__) && !defined(__aarch64__) | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | }  // extern "C" | 
|  | }  // namespace libyuv | 
|  | #endif |