Simplify finding final direction in cdef_find_dir_neon
Simplify and optimize the final step of cdef_find_dir_neon, which finds
the maximum cost and its associated index in the cost vectors.
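
As a rough sketch (illustrative only, not the exact reference code),
the scalar reduction being vectorized here is:

  best_cost = 0;
  best_dir = 0;
  for (int i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }

On AArch64 the maximum is found with vmaxvq_u32 and its index is
recovered with a table lookup and a leading-zero count; on Armv7 the
maximum is found with pairwise maxima and the index with a bit mask
and a trailing-zero count.
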
Change-Id: Id9821597b49c30689d63fff00ee1c748622faf69
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index b5a8b97..4261592 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -17,6 +17,16 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
+static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u8(a);
+#else
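+  // Pairwise widening adds: 8x u8 -> 4x u16 -> 2x u32, then sum the two
+  // remaining 32-bit lanes.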
+ uint16x4_t b = vpaddl_u8(a);
+ uint32x2_t c = vpaddl_u16(b);
+ return vget_lane_u32(c, 0) + vget_lane_u32(c, 1);
+#endif
+}
+
static INLINE int horizontal_add_s16x8(const int16x8_t a) {
#if AOM_ARCH_AARCH64
return vaddlvq_s16(a);
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 7f4c165..a509628 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -13,6 +13,7 @@
#define AOM_AOM_PORTS_BITOPS_H_
#include <assert.h>
+#include <stdint.h>
#include "aom_ports/msvc.h"
#include "config/aom_config.h"
@@ -52,7 +53,6 @@
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-#undef USE_MSC_INTRINSICS
#else
static INLINE int get_msb(unsigned int n) {
int log = 0;
@@ -71,6 +71,32 @@
}
#endif
+#if defined(__GNUC__) && \
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int aom_clzll(uint64_t n) { return __builtin_clzll(n); }
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse64)
+
+static INLINE int aom_clzll(uint64_t n) {
+  assert(n != 0);
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  _BitScanReverse64(&first_set_bit, n);
+  // _BitScanReverse64 reports the bit index of the most significant set bit,
+  // so convert it into a leading-zero count.
+  return 63 ^ (int)first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+static INLINE int aom_clzll(uint64_t n) {
+ assert(n != 0);
+
+ int res = 0;
+ uint64_t high_bit = 1ULL << 63;
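+  // Shift n left until its top bit is set; the number of shifts is the
+  // leading-zero count.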
+ while (!(n & high_bit)) {
+ res++;
+ n <<= 1;
+ }
+ return res;
+}
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index a6567fe..f69e9c4 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -16,6 +16,7 @@
#include "config/av1_rtcd.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"
void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
@@ -363,19 +364,6 @@
res[0] = vreinterpretq_s16_s64(ziphi_s64(tr1_6, tr1_7));
}
-static INLINE uint32_t compute_best_dir(uint8x16_t a) {
- uint8x16_t idx =
- vandq_u8(a, vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
-#if AOM_ARCH_AARCH64
- return vaddv_u8(vget_low_u8(idx)) + (vaddv_u8(vget_high_u8(idx)) << 8);
-#else
- uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(idx)));
- uint8x16_t s = vreinterpretq_u8_u64(m);
- return vget_lane_u32(
- vreinterpret_u32_u8(vzip_u8(vget_low_u8(s), vget_high_u8(s)).val[0]), 0);
-#endif
-}
-
int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
int coeff_shift) {
uint32_t cost[8];
@@ -396,15 +384,35 @@
// Compute "mostly horizontal" directions.
uint32x4_t cost03 = compute_directions_neon(lines, cost);
- uint32x4_t max_cost = vmaxq_u32(cost03, cost47);
- max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 2));
- max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 1));
- best_cost = vgetq_lane_u32(max_cost, 0);
- uint16x8_t idx = vcombine_u16(vqmovn_u32(vceqq_u32(max_cost, cost03)),
- vqmovn_u32(vceqq_u32(max_cost, cost47)));
- uint8x16_t idx_u8 = vcombine_u8(vqmovn_u16(idx), vqmovn_u16(idx));
- best_dir = compute_best_dir(idx_u8);
- best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+  // Find the max cost as well as its index to get best_dir.
+  // The max cost needs to be propagated across the whole vector so that its
+  // position can be found in the original cost vectors cost03 and cost47.
+ uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
+#if AOM_ARCH_AARCH64
+ best_cost = vmaxvq_u32(cost07);
+ uint32x4_t max_cost = vdupq_n_u32(best_cost);
+ uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
+ vreinterpretq_u8_u32(
+ vceqq_u32(max_cost, cost47)) } };
+ // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
+ uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
+  // Get the lowest 8 bits of each 32-bit element and reverse their order.
+ uint8x8_t tbl = vqtbl2_u8(costs, idx);
+ uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
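+  // Direction 0 ends up in the most significant byte of 'a' and direction 7
+  // in the least significant one, so the leading-zero count divided by 8
+  // gives the lowest direction index that attains the max cost.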
+ best_dir = aom_clzll(a) >> 3;
+#else
+ uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
+ cost64 = vpmax_u32(cost64, cost64);
+ uint32x4_t max_cost = vcombine_u32(cost64, cost64);
+ best_cost = vget_lane_u32(cost64, 0);
+ uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
+ vmovn_u32(vceqq_u32(max_cost, cost47)));
+ uint8x8_t idx =
+ vand_u8(vmovn_u16(costs),
+ vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
+ int sum = horizontal_add_u8x8(idx);
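+  // sum has bit i set iff direction i attains the max cost. sum ^ (sum - 1)
+  // keeps only the lowest set bit and those below it, so get_msb() counts
+  // the trailing zeros, i.e. the lowest matching direction.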
+ best_dir = get_msb(sum ^ (sum - 1));
+#endif
// Difference between the optimal variance and the variance along the
// orthogonal direction. Again, the sum(x^2) terms cancel out.