Merge "Fix inconsistency in gm parameter write to bitstream" into nextgenv2
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 0d0dc94..2adbef1 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -376,4 +376,6 @@
DSP_SRCS-yes += aom_dsp_rtcd.c
DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
+DSP_SRCS-yes += aom_simd.c
+
$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a2b9a75..ba4b40f 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -587,11 +587,19 @@
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") {
- add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+ specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+ add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
+ specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
+ add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
+ specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
+ }
+ add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
- add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
+ add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
- add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum";
+ add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
}
diff --git a/aom_dsp/aom_simd.c b/aom_dsp/aom_simd.c
new file mode 100644
index 0000000..03f4ba9
--- /dev/null
+++ b/aom_dsp/aom_simd.c
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Set to 1 to add some sanity checks in the fallback C code
+const int simd_check = 1;
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index 611949a..d062e07 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -90,8 +90,8 @@
return literal;
}
-static INLINE int aom_read_tree(aom_reader *r, const aom_tree_index *tree,
- const aom_prob *probs) {
+static INLINE int aom_read_tree_bits(aom_reader *r, const aom_tree_index *tree,
+ const aom_prob *probs) {
aom_tree_index i = 0;
while ((i = tree[i + aom_read(r, probs[i >> 1])]) > 0) continue;
@@ -99,6 +99,11 @@
return -i;
}
+static INLINE int aom_read_tree(aom_reader *r, const aom_tree_index *tree,
+ const aom_prob *probs) {
+ return aom_read_tree_bits(r, tree, probs);
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index d3e4ae9..5e34fd6 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -70,6 +70,22 @@
for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
}
+static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
+ const aom_prob *probs, int bits, int len,
+ aom_tree_index i) {
+ do {
+ const int bit = (bits >> --len) & 1;
+ aom_write(w, bit, probs[i >> 1]);
+ i = tr[i + bit];
+ } while (len);
+}
+
+static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
+ const aom_prob *probs, int bits, int len,
+ aom_tree_index i) {
+ aom_write_tree_bits(w, tree, probs, bits, len, i);
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 13d1314..73549b8 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -28,7 +28,7 @@
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
- return vcombine_s64(b, a);
+ return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -52,7 +52,9 @@
}
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
return c ? vreinterpretq_s64_s8(
vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
: b;
@@ -122,7 +124,7 @@
SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
ssd128_internal s;
- s.hi = s.lo = 0;
+ s.hi = s.lo = (ssd64_internal)(uint64_t)0;
return s;
}
@@ -430,11 +432,11 @@
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
return v128_from_64(
- vreinterpret_s64_u8(
+ (uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_high_s64(pattern)))),
- vreinterpret_s64_u8(
+ (uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_low_s64(pattern)))));
@@ -521,21 +523,24 @@
vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
return n < 8
? v128_from_64(
- vorr_u64(vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- n * 8),
- vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- (8 - n) * 8)),
- vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8))
- : (n == 8 ? v128_from_64(vreinterpret_u64_s64(vget_low_s64(a)), 0)
- : v128_from_64(
- vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- (n - 8) * 8),
- 0));
+ (uint64_t)vorr_u64(
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ n * 8),
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ (8 - n) * 8)),
+ (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ n * 8))
+ : (n == 8 ? v128_from_64(
+ (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+ : v128_from_64((uint64_t)vshl_n_u64(
+ vreinterpret_u64_s64(vget_low_s64(a)),
+ (n - 8) * 8),
+ 0));
}
SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index 561ac86..34e312e 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -15,6 +15,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "./v64_intrinsics_c.h"
+#include "./aom_config.h"
typedef union {
uint8_t u8[16];
@@ -406,11 +407,13 @@
}
SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_8(a, b, 1) : _c_v128_unzip_8(a, b, 0);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
+ : _c_v128_unzip_8(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_8(b, a, 0) : _c_v128_unzip_8(b, a, 1);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
+ : _c_v128_unzip_8(b, a, 1);
}
SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
@@ -438,11 +441,13 @@
}
SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_16(a, b, 1) : _c_v128_unzip_16(a, b, 0);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
+ : _c_v128_unzip_16(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_16(b, a, 0) : _c_v128_unzip_16(b, a, 1);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
+ : _c_v128_unzip_16(b, a, 1);
}
SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
@@ -462,11 +467,13 @@
}
SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_32(a, b, 1) : _c_v128_unzip_32(a, b, 0);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
+ : _c_v128_unzip_32(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
- return big_endian() ? _c_v128_unzip_32(b, a, 0) : _c_v128_unzip_32(b, a, 1);
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
+ : _c_v128_unzip_32(b, a, 1);
}
SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
@@ -535,8 +542,8 @@
c);
abort();
}
- t.u8[c] =
- a.u8[big_endian() ? 15 - (pattern.u8[c] & 15) : pattern.u8[c] & 15];
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
+ : pattern.u8[c] & 15];
}
return t;
}
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index e09cbb9..4504996 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -58,7 +58,9 @@
_mm_storeu_si128((__m128i *)p, a);
}
-#if defined(__OPTIMIZE__)
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
#if defined(__SSSE3__)
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
return c ? _mm_alignr_epi8(a, b, c) : b;
@@ -418,26 +420,19 @@
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c);
- return _mm_packus_epi16(
- _mm_srli_epi16(
- _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x), 8),
- _mm_srli_epi16(
- _mm_sll_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x), 8));
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
- return _mm_packus_epi16(
- _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
- _mm_srl_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
__m128i x = _mm_cvtsi32_si128(c + 8);
- return _mm_packs_epi16(
- _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
- _mm_sra_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+ return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+ _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
@@ -468,20 +463,13 @@
to enforce that. */
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
-#define v128_shl_n_8(a, c) \
- _mm_packus_epi16( \
- _mm_srli_epi16( \
- _mm_slli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
- _mm_srli_epi16( \
- _mm_slli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c), 8))
-#define v128_shr_n_u8(a, c) \
- _mm_packus_epi16( \
- _mm_srli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
- _mm_srli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
-#define v128_shr_n_s8(a, c) \
- _mm_packs_epi16( \
- _mm_srai_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
- _mm_srai_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
+#define v128_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index b487303..bf92167 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -14,12 +14,10 @@
#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
-/* vzip in gcc is broken. Fixed in 4.6.1? */
-#if __GNUC__ && \
- ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
- (4 << 16) + (6 << 8) + 1)
-#error vzip buggy in gcc. Get at least gcc 4.6.1.
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
#endif
typedef int64x1_t v64;
@@ -51,7 +49,7 @@
SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-SIMD_INLINE uint64_t v64_u64(v64 x) { return x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
@@ -66,12 +64,16 @@
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if __CC_ARM
+#if __clang__
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
+#elif __CC_ARM
*(__packed uint32_t *)p) = a;
#elif __GNUC__
*((__attribute((packed)) uint32_t *)p) = a;
#else
- vst1_lane_u32((uint32_t*)p, vreinterpret_u32_s64(a), 0);
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
#endif
}
@@ -91,13 +93,16 @@
vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}
+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
return c ? vreinterpret_s64_s8(
vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
: b;
#else
- return c ? v64_from_64(b >> c * 8) | (a << (8 - c) * 8) : b;
+ return c ? v64_from_64((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)
+ : b;
#endif
}
@@ -121,21 +126,21 @@
int64x2_t r = vpaddlq_s32(vpaddlq_s16(
vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
- return vadd_s64(vget_high_s64(r), vget_low_s64(r));
+ return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}
SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
int64x2_t r =
vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
- return vget_high_s64(r) + vget_low_s64(r);
+ return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}
SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
- return vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+ return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
- return vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+ return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}
typedef uint16x8_t sad64_internal;
@@ -151,12 +156,14 @@
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
- return (uint32_t)(vget_high_u64(r) + vget_low_u64(r));
+ return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}
typedef int64x1_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return 0; }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+ return (ssd64_internal)(uint64_t)0;
+}
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
@@ -166,7 +173,9 @@
return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return (uint32_t)s; }
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+ return (uint32_t)(uint64_t)s;
+}
SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -470,7 +479,9 @@
vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
return vshl_n_s64(a, c * 8);
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 502df23..b951492 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -86,9 +86,10 @@
_mm_storel_epi64((__m128i *)p, a);
}
+// The following function requires an immediate.
#if __OPTIMIZE__
#define v64_align(a, b, c) \
- (c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b;
+ ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c) \
((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
@@ -388,25 +389,18 @@
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
- return _mm_packus_epi16(
- _mm_srli_epi16(_mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a),
- _mm_cvtsi32_si128(c)),
- 8),
- _mm_setzero_si128());
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
- __m128i cp8 = _mm_cvtsi32_si128(c + 8);
- return _mm_packus_epi16(
- _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
- _mm_setzero_si128());
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
- __m128i cp8 = _mm_cvtsi32_si128(c + 8);
return _mm_packs_epi16(
- _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
- _mm_setzero_si128());
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
}
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
@@ -437,19 +431,12 @@
to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c) \
- _mm_packus_epi16( \
- _mm_srli_epi16( \
- _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
- _mm_setzero_si128())
-#define v64_shr_n_u8(a, c) \
- _mm_packus_epi16( \
- _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
- _mm_setzero_si128())
-#define v64_shr_n_s8(a, c) \
- _mm_packs_epi16( \
- _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
- _mm_setzero_si128())
+#define v64_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
diff --git a/aom_util/aom_util.mk b/aom_util/aom_util.mk
index fcf0d70..14b484a 100644
--- a/aom_util/aom_util.mk
+++ b/aom_util/aom_util.mk
@@ -9,6 +9,7 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
+
UTIL_SRCS-yes += aom_util.mk
UTIL_SRCS-yes += aom_thread.c
UTIL_SRCS-yes += aom_thread.h
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c
index 0385df7..52389d0 100644
--- a/aom_util/debug_util.c
+++ b/aom_util/debug_util.c
@@ -1,17 +1,17 @@
/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <stdio.h>
#include "aom_util/debug_util.h"
-
#define QUEUE_MAX_SIZE 2000000
static int result_queue[QUEUE_MAX_SIZE];
static int prob_queue[QUEUE_MAX_SIZE];
diff --git a/aom_util/debug_util.h b/aom_util/debug_util.h
index 7c2299a..c52e385 100644
--- a/aom_util/debug_util.h
+++ b/aom_util/debug_util.h
@@ -1,11 +1,12 @@
/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_UTIL_DEBUG_UTIL_H_
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 668c75f..1cf5272 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -8,8 +8,10 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
+#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int av1_clpf_maxbits(const AV1_COMMON *cm) {
@@ -27,58 +29,113 @@
return (8 + delta - (delta < 0)) >> 4;
}
-void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
- int y0, int sizex, int sizey, int width, int height,
- unsigned int strength) {
+void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizex, int sizey,
+ int width, int height, unsigned int strength) {
int x, y;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
- int X = src[y * stride + x];
- int A = src[AOMMAX(0, y - 1) * stride + x];
- int B = src[y * stride + AOMMAX(0, x - 2)];
- int C = src[y * stride + AOMMAX(0, x - 1)];
- int D = src[y * stride + AOMMIN(width - 1, x + 1)];
- int E = src[y * stride + AOMMIN(width - 1, x + 2)];
- int F = src[AOMMIN(height - 1, y + 1) * stride + x];
+ int X = src[y * sstride + x];
+ int A = src[AOMMAX(0, y - 1) * sstride + x];
+ int B = src[y * sstride + AOMMAX(0, x - 2)];
+ int C = src[y * sstride + AOMMAX(0, x - 1)];
+ int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+ int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+ int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
- dst[y * stride + x] = X + delta;
+ dst[y * dstride + x] = X + delta;
}
}
}
+#if CONFIG_AOM_HIGHBITDEPTH
+// Identical to aom_clpf_block_c() apart from "src" and "dst".
+void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizex, int sizey,
+ int width, int height, unsigned int strength) {
+ int x, y;
+ for (y = y0; y < y0 + sizey; y++) {
+ for (x = x0; x < x0 + sizex; x++) {
+ int X = src[y * sstride + x];
+ int A = src[AOMMAX(0, y - 1) * sstride + x];
+ int B = src[y * sstride + AOMMAX(0, x - 2)];
+ int C = src[y * sstride + AOMMAX(0, x - 1)];
+ int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+ int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+ int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
+ int delta;
+ delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
+ dst[y * dstride + x] = X + delta;
+ }
+ }
+}
+#endif
+
// Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
- const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *blocks,
+ unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
- const int bs = MI_SIZE;
- int width = cm->mi_cols * bs;
- int height = cm->mi_rows * bs;
+ const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
+ const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
+ const int bs = (subx || suby) ? 4 : 8;
+ const int bslog = get_msb(bs);
+ int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
+ int height =
+ plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
int xpos, ypos;
- int stride_y = rec->y_stride;
- int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
- int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
+ const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
+ int dstride = bs;
+ const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
+ const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
int block_index = 0;
+ uint8_t *cache = NULL;
+ uint8_t **cache_ptr = NULL;
+ uint8_t **cache_dst = NULL;
+ int cache_idx = 0;
+ const int cache_size = num_fb_hor << (2 * fb_size_log2);
+ const int cache_blocks = cache_size / (bs * bs);
+ uint8_t *src_buffer =
+ plane != AOM_PLANE_Y
+ ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
+ : frame->y_buffer;
+ uint8_t *dst_buffer;
+
+// Make buffer space for in-place filtering
+#if CONFIG_AOM_HIGHBITDEPTH
+ strength <<= (cm->bit_depth - 8);
+ CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
+ dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
+#else
+ CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
+ dst_buffer = cache;
+#endif
+ CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
+ CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
+ memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
// Iterate over all filter blocks
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
int h, w;
int allskip = 1;
+ const int xoff = l << fb_size_log2;
+ const int yoff = k << fb_size_log2;
for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
- xpos = (l << fb_size_log2) + n * bs;
- ypos = (k << fb_size_log2) + m * bs;
+ xpos = xoff + n * bs;
+ ypos = yoff + m * bs;
if (xpos < width && ypos < height) {
allskip &=
- cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
+ cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+ (xpos << subx) / MI_SIZE]
->mbmi.skip;
}
}
@@ -91,36 +148,145 @@
w += !w << fb_size_log2;
if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag ||
- decision(k, l, rec, org, cm, bs, w / bs, h / bs, strength,
+ decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2, blocks + block_index))) {
// Iterate over all smaller blocks inside the filter block
- for (m = 0; m < (h + bs - 1) / bs; m++) {
- for (n = 0; n < (w + bs - 1) / bs; n++) {
- xpos = (l << fb_size_log2) + n * bs;
- ypos = (k << fb_size_log2) + m * bs;
- if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
- ->mbmi.skip) {
- // Not skip block, apply the filter
- aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
- bs, bs, width, height, strength);
- } else { // Skip block, copy instead
- for (c = 0; c < bs; c++)
- *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
- *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
+ for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
+ for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+ int sizex, sizey;
+ xpos = xoff + n * bs;
+ ypos = yoff + m * bs;
+ sizex = AOMMIN(width - xpos, bs);
+ sizey = AOMMIN(height - ypos, bs);
+ if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+ (xpos << subx) / MI_SIZE]
+ ->mbmi.skip) { // Not skip block
+ // Temporary buffering needed if filtering in-place
+ if (cache_ptr[cache_idx]) {
+// Copy filtered block back into the frame
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
+ if (sizex == 8) {
+ for (c = 0; c < sizey; c++) {
+ *(uint64_t *)(d + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+ *(uint64_t *)(d + c * sstride + 4) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+ }
+ } else if (sizex == 4) {
+ for (c = 0; c < sizey; c++)
+ *(uint64_t *)(d + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+ } else {
+ for (c = 0; c < sizey; c++)
+ memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+ sizex);
+ }
+ } else {
+ if (sizex == 8)
+ for (c = 0; c < sizey; c++)
+ *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+ else if (sizex == 4)
+ for (c = 0; c < sizey; c++)
+ *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+ else
+ for (c = 0; c < sizey; c++)
+ memcpy(cache_dst[cache_idx] + c * sstride,
+ cache_ptr[cache_idx] + c * bs, sizex);
+ }
+#else
+ if (sizex == 8)
+ for (c = 0; c < sizey; c++)
+ *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+ else if (sizex == 4)
+ for (c = 0; c < sizey; c++)
+ *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+ else
+ for (c = 0; c < sizey; c++)
+ memcpy(cache_dst[cache_idx] + c * sstride,
+ cache_ptr[cache_idx] + c * bs, sizex);
+#endif
+ }
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
+ dst_buffer =
+ CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
+ } else {
+ cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+ dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+ }
+#else
+ cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+ dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+#endif
+ cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
+ if (++cache_idx >= cache_blocks) cache_idx = 0;
+
+// Apply the filter
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
+ CONVERT_TO_SHORTPTR(dst_buffer), sstride,
+ dstride, xpos, ypos, sizex, sizey, width,
+ height, strength);
+ } else {
+ aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
+ ypos, sizex, sizey, width, height, strength);
+ }
+#else
+ aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
+ ypos, sizex, sizey, width, height, strength);
+#endif
}
}
}
- } else { // Entire filter block is skip, copy
- for (m = 0; m < h; m++)
- memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
- (l << fb_size_log2),
- rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
- (l << fb_size_log2),
- w);
}
block_index += !allskip; // Count number of blocks filtered
}
}
+ // Copy remaining blocks into the frame
+ for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
+ cache_idx++) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
+ for (c = 0; c < bs; c++) {
+ *(uint64_t *)(d + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+ if (bs == 8)
+ *(uint64_t *)(d + c * sstride + 4) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+ }
+ } else {
+ for (c = 0; c < bs; c++)
+ if (bs == 4)
+ *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+ else
+ *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+ }
+#else
+ for (c = 0; c < bs; c++)
+ if (bs == 4)
+ *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+ else
+ *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+ *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+#endif
+ }
+
+ aom_free(cache);
+ aom_free(cache_ptr);
+ aom_free(cache_dst);
+
return block_index;
}
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 21671a1..8e4213b 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -17,10 +17,10 @@
int av1_clpf_maxbits(const AV1_COMMON *cm);
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
- const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *blocks,
+ unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index faaf8ea..6fef4b7 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -10,187 +10,350 @@
*/
#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
- int y0, int sizey, int width, int height,
- unsigned int strength) {
- dst += x0 + y0 * stride;
- src += x0 + y0 * stride;
- {
- int bottom = height - 2 - y0;
- const v128 sp = v128_dup_8(strength);
- const v128 sm = v128_dup_8(-(int)strength);
- const v128 c8 = v128_dup_8(8);
- const v128 c128 = v128_dup_8(128);
+// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
+// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
+// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
+SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 sp, v128 sm) {
+ // The difference will be 9 bit, offset by 128 so we can use saturated
+ // sub to avoid going to 16 bit temporarily before "strength" clipping.
+ const v128 c128 = v128_dup_8(128);
+ const v128 x = v128_add_8(c128, o);
+ const v128 c8 = v128_dup_8(8);
+ const v128 tmp = v128_add_8(
+ v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
+ v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
+ const v128 delta = v128_add_8(
+ v128_add_8(
+ v128_shl_8(
+ v128_add_8(
+ v128_max_s8(
+ v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
+ sm),
+ v128_max_s8(
+ v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
+ sm)),
+ 2),
+ v128_add_8(
+ v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
+ sm),
+ v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
+ sm))),
+ v128_add_8(v128_add_8(tmp, tmp), tmp));
+ return v128_add_8(
+ o,
+ v128_shr_s8(
+ v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+ 4));
+}
- if (!x0) { // Clip left
- const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
- v64_from_64(0x0504030201000000LL));
- const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
- v64_from_64(0x0605040302010000LL));
- int y;
+// Process blocks of width 8, two lines at a time, 8 bit.
+static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizey, int width,
+ int height, unsigned int strength) {
+ const int bottom = height - 2 - y0;
+ const int right = width - 8 - x0;
+ const v128 sp = v128_dup_8(strength);
+ const v128 sm = v128_dup_8(-(int)strength);
+ DECLARE_ALIGNED(16, static const uint64_t,
+ b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
+ int y;
- for (y = 0; y < sizey; y += 2) {
- const v64 l1 = v64_load_aligned(src);
- const v64 l2 = v64_load_aligned(src + stride);
- v128 o = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, o);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
- const v128 b = v128_shuffle_8(x, b_shuff);
- const v128 c = v128_shuffle_8(x, c_shuff);
- const v128 d = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src + 1),
- v64_load_unaligned(src + 1 + stride)));
- const v128 e = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src + 2),
- v64_load_unaligned(src + 2 + stride)));
- const v128 f = v128_add_8(
- c128, v128_from_v64(l2, v64_load_aligned(
- src + ((y != bottom) + 1) * stride)));
+ dst += x0 + y0 * dstride;
+ src += x0 + y0 * sstride;
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- const v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- o = v128_add_8(
- o, v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
- delta, v128_zero()))),
- 4));
- v64_store_aligned(dst, v128_high_v64(o));
- v64_store_aligned(dst + stride, v128_low_v64(o));
- src += stride * 2;
- dst += stride * 2;
- }
- } else if (!(width - x0 - 8)) { // Clip right
- const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
- v64_from_64(0x0707060504030201LL));
- const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
- v64_from_64(0x0707070605040302LL));
- int y;
+ for (y = 0; y < sizey; y += 2) {
+ const v64 l1 = v64_load_aligned(src);
+ const v64 l2 = v64_load_aligned(src + sstride);
+ v128 o = v128_from_v64(l1, l2);
+ const v128 a =
+ v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
+ const v128 f = v128_from_v64(
+ l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+ v128 b, c, d, e;
- for (y = 0; y < sizey; y += 2) {
- const v64 l1 = v64_load_aligned(src);
- const v64 l2 = v64_load_aligned(src + stride);
- v128 o = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, o);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
- const v128 b = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src - 2),
- v64_load_unaligned(src - 2 + stride)));
- const v128 c = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src - 1),
- v64_load_unaligned(src - 1 + stride)));
- const v128 d = v128_shuffle_8(x, d_shuff);
- const v128 e = v128_shuffle_8(x, e_shuff);
- const v128 f = v128_add_8(
- c128, v128_from_v64(l2, v64_load_aligned(
- src + ((y != bottom) + 1) * stride)));
-
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- const v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- o = v128_add_8(
- o, v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
- delta, v128_zero()))),
- 4));
- v64_store_aligned(dst, v128_high_v64(o));
- v64_store_aligned(dst + stride, v128_low_v64(o));
- src += stride * 2;
- dst += stride * 2;
- }
- } else { // No left/right clipping
- int y;
- for (y = 0; y < sizey; y += 2) {
- const v64 l1 = v64_load_aligned(src);
- const v64 l2 = v64_load_aligned(src + stride);
- v128 o = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, o);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
- const v128 b = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src - 2),
- v64_load_unaligned(src - 2 + stride)));
- const v128 c = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src - 1),
- v64_load_unaligned(src - 1 + stride)));
- const v128 d = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src + 1),
- v64_load_unaligned(src + 1 + stride)));
- const v128 e = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(src + 2),
- v64_load_unaligned(src + 2 + stride)));
- const v128 f = v128_add_8(
- c128, v128_from_v64(l2, v64_load_aligned(
- src + ((y != bottom) + 1) * stride)));
-
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- const v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- o = v128_add_8(
- o, v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
- delta, v128_zero()))),
- 4));
- v64_store_aligned(dst, v128_high_v64(o));
- v64_store_aligned(dst + stride, v128_low_v64(o));
- src += stride * 2;
- dst += stride * 2;
- }
+ if (x0) {
+ b = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ c = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
}
+ if (right) {
+ d = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ e = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+ }
+
+ o = calc_delta(o, a, b, c, d, e, f, sp, sm);
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+ src += sstride * 2;
+ dst += dstride * 2;
}
}
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
- int x0, int y0, int sizex, int sizey, int width,
- int height, unsigned int strength) {
- // TODO(stemidts):
- // A sizex different from 8 will only be needed if CLPF is extended to chroma.
- // This will only be used if 4:2:0 and width not a multiple of 16 and along
- // the right edge only, so we can fall back to the plain C implementation in
- // this case. If not extended to chroma, this test will be redundant.
- if (sizex != 8 || width < 16) { // Fallback to C if frame width < 16
- aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
- strength);
- } else {
- clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+// Process blocks of width 4, four lines at a time, 8 bit.
+static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizey, int width,
+ int height, unsigned int strength) {
+ const v128 sp = v128_dup_8(strength);
+ const v128 sm = v128_dup_8(-(int)strength);
+ const int right = width - 4 - x0;
+ const int bottom = height - 4 - y0;
+ DECLARE_ALIGNED(16, static const uint64_t,
+ b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
+ int y;
+
+ dst += x0 + y0 * dstride;
+ src += x0 + y0 * sstride;
+
+ for (y = 0; y < sizey; y += 4) {
+ const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
+ const uint32_t l1 = u32_load_aligned(src);
+ const uint32_t l2 = u32_load_aligned(src + sstride);
+ const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
+ const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
+ const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
+ v128 o = v128_from_32(l1, l2, l3, l4);
+ const v128 a = v128_from_32(l0, l1, l2, l3);
+ const v128 f = v128_from_32(l2, l3, l4, l5);
+ v128 b, c, d, e;
+
+ if (x0) {
+ b = v128_from_32(u32_load_unaligned(src - 2),
+ u32_load_unaligned(src + sstride - 2),
+ u32_load_unaligned(src + 2 * sstride - 2),
+ u32_load_unaligned(src + 3 * sstride - 2));
+ c = v128_from_32(u32_load_unaligned(src - 1),
+ u32_load_unaligned(src + sstride - 1),
+ u32_load_unaligned(src + 2 * sstride - 1),
+ u32_load_unaligned(src + 3 * sstride - 1));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+ }
+ if (right) {
+ d = v128_from_32(u32_load_unaligned(src + 1),
+ u32_load_unaligned(src + sstride + 1),
+ u32_load_unaligned(src + 2 * sstride + 1),
+ u32_load_unaligned(src + 3 * sstride + 1));
+ e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
+ u32_load_unaligned(src + sstride + 2),
+ u32_load_unaligned(src + 2 * sstride + 2),
+ u32_load_unaligned(src + 3 * sstride + 2));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+ }
+
+ o = calc_delta(o, a, b, c, d, e, f, sp, sm);
+ u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
+ u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
+ u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
+ u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
}
}
+
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizex,
+ int sizey, int width, int height,
+ unsigned int strength) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block widths not 4 or 8
+ // * block heights not a multiple of 4 if the block width is 4
+ aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+ height, strength);
+ } else {
+ (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
+ sizey, width, height, strength);
+ }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
+// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
+// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
+SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 sp, v128 sm) {
+ const v128 c8 = v128_dup_16(8);
+ const v128 tmp =
+ v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
+ v128_max_s16(v128_min_s16(v128_sub_16(d, o), sp), sm));
+ const v128 delta = v128_add_16(
+ v128_add_16(
+ v128_shl_16(
+ v128_add_16(
+ v128_max_s16(v128_min_s16(v128_sub_16(a, o), sp), sm),
+ v128_max_s16(v128_min_s16(v128_sub_16(f, o), sp), sm)),
+ 2),
+ v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
+ v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
+ v128_add_16(v128_add_16(tmp, tmp), tmp));
+ return v128_add_16(
+ o, v128_shr_s16(
+ v128_add_16(
+ c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+ 4));
+}
+
+static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, uint16_t *dst, v128 sp, v128 sm,
+ int dstride) {
+ o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+}
+
+static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, uint16_t *dst, v128 sp, v128 sm) {
+ v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
+}
+
+// Process blocks of width 4, two lines at a time.
+SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
+ int sstride, int dstride, int x0, int y0,
+ int sizey, int width, int height,
+ unsigned int strength) {
+ const v128 sp = v128_dup_16(strength);
+ const v128 sm = v128_dup_16(-(int)strength);
+ const int right = width - 4 - x0;
+ const int bottom = height - 2 - y0;
+ DECLARE_ALIGNED(16, static const uint64_t,
+ b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
+ int y;
+
+ dst += x0 + y0 * dstride;
+ src += x0 + y0 * sstride;
+
+ for (y = 0; y < sizey; y += 2) {
+ const v64 l1 = v64_load_aligned(src);
+ const v64 l2 = v64_load_aligned(src + sstride);
+ v128 o = v128_from_v64(l1, l2);
+ const v128 a =
+ v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
+ const v128 f = v128_from_v64(
+ l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+ v128 b, c, d, e;
+
+ if (x0) {
+ b = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ c = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+ }
+ if (right) {
+ d = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ e = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+ }
+ calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
+ src += sstride * 2;
+ dst += dstride * 2;
+ }
+}
+
+// The simplest case. Start here if you need to understand the functions.
+SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizey,
+ int width, int height, unsigned int strength) {
+ const v128 sp = v128_dup_16(strength);
+ const v128 sm = v128_dup_16(-(int)strength);
+ const int right = width - 8 - x0;
+ const int bottom = height - 2 - y0;
+ DECLARE_ALIGNED(16, static const uint64_t,
+ b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
+ int y;
+
+ dst += x0 + y0 * dstride;
+ src += x0 + y0 * sstride;
+
+  // Read 8 sets of pixels at a time. Clipping along upper and lower
+ // edges is handled by reading the upper or lower line twice.
+ // Clipping along the left and right edges is handled by shuffle
+ // instructions doing shift and pad.
+ for (y = 0; y < sizey; y++) {
+ const v128 o = v128_load_aligned(src);
+ const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
+ const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
+ v128 b, c, d, e;
+
+ if (x0) {
+ b = v128_load_unaligned(src - 2);
+ c = v128_load_unaligned(src - 1);
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+ }
+ if (right) {
+ d = v128_load_unaligned(src + 1);
+ e = v128_load_unaligned(src + 2);
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+ }
+ calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
+ src += sstride;
+ dst += dstride;
+ }
+}
+
+void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
+ int sstride, int dstride, int x0, int y0,
+ int sizex, int sizey, int width, int height,
+ unsigned int strength) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block width not 4 or 8
+ // * block heights not a multiple of 2 if the block width is 4
+ aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
+ width, height, strength);
+ } else {
+ (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
+ src, dst, sstride, dstride, x0, y0, sizey, width, height, strength);
+ }
+}
+#endif
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 2bb680a..98f4f51 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -153,7 +153,9 @@
#if CONFIG_CLPF
int clpf_numblocks;
int clpf_size;
- int clpf_strength;
+ int clpf_strength_y;
+ int clpf_strength_u;
+ int clpf_strength_v;
uint8_t *clpf_blocks;
#endif
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index dc18944..7daeb5d 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -29,6 +29,7 @@
#include "av1/common/alloccommon.h"
#if CONFIG_CLPF
+#include "aom/aom_image.h"
#include "av1/common/clpf.h"
#endif
#include "av1/common/common.h"
@@ -2046,8 +2047,10 @@
#if CONFIG_CLPF
static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
cm->clpf_blocks = 0;
- cm->clpf_strength = aom_rb_read_literal(rb, 2);
- if (cm->clpf_strength) {
+ cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
+ cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
+ cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
+ if (cm->clpf_strength_y) {
cm->clpf_size = aom_rb_read_literal(rb, 2);
if (cm->clpf_size) {
int i;
@@ -3928,20 +3931,23 @@
#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_CLPF
- if (cm->clpf_strength && !cm->skip_loop_filter) {
- YV12_BUFFER_CONFIG dst; // Buffer for the result
-
- dst = pbi->cur_buf->buf;
- CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
-
- av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
- cm->clpf_strength + (cm->clpf_strength == 3),
- 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
-
- // Copy result
- memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
- dst.y_height * dst.y_stride);
- aom_free(dst.y_buffer);
+ if (!cm->skip_loop_filter) {
+ const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
+ if (cm->clpf_strength_y) {
+ av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
+ cm->clpf_strength_y + (cm->clpf_strength_y == 3),
+ 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
+ }
+ if (cm->clpf_strength_u) {
+ av1_clpf_frame(frame, NULL, cm, 0,
+ cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
+ AOM_PLANE_U, NULL);
+ }
+ if (cm->clpf_strength_v) {
+ av1_clpf_frame(frame, NULL, cm, 0,
+ cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
+ AOM_PLANE_V, NULL);
+ }
}
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index dab1008..1438a56 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -1454,6 +1454,7 @@
};
mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+ mi->bmi[j].as_mode = b_mode;
if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int;
if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 2b846e8..6774bb2 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -662,7 +662,7 @@
if (t != ONE_TOKEN) {
int len = UNCONSTRAINED_NODES - p->skip_eob_node;
- av1_write_tree(w, av1_coef_con_tree,
+ aom_write_tree(w, av1_coef_con_tree,
av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], v,
n - len, 0);
}
@@ -836,7 +836,7 @@
const struct segmentation_probs *segp,
int segment_id) {
if (seg->enabled && seg->update_map)
- av1_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
+ aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
}
// This function encodes the reference frame
@@ -2590,8 +2590,10 @@
#if CONFIG_CLPF
static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
- aom_wb_write_literal(wb, cm->clpf_strength, 2);
- if (cm->clpf_strength) {
+ aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
+ aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
+ aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
+ if (cm->clpf_strength_y) {
aom_wb_write_literal(wb, cm->clpf_size, 2);
if (cm->clpf_size) {
int i;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 2156032..57b42a8 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -164,9 +164,6 @@
// Store the second best motion vector during full-pixel motion search
int_mv second_best_mv;
- // Strong color activity detection. Used in RTC coding mode to enhance
- // the visual quality at the boundary of moving color objects.
- uint8_t color_sensitivity[2];
// use default transform and skip transform type search for intra modes
int use_default_intra_tx_type;
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 4221505..1d498f1 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -11,16 +11,17 @@
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
+#include "aom/aom_image.h"
#include "aom/aom_integer.h"
#include "av1/common/quant_common.h"
// Calculate the error of a filtered and unfiltered block
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
int ostride, int x0, int y0, int width, int height,
- int *sum0, int *sum1, unsigned int strength) {
+ int *sum0, int *sum1, unsigned int strength, int size) {
int x, y;
- for (y = y0; y < y0 + 8; y++) {
- for (x = x0; x < x0 + 8; x++) {
+ for (y = y0; y < y0 + size; y++) {
+ for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -39,11 +40,11 @@
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
- int width, int height, int *sum) {
+ int width, int height, int *sum, int size) {
int x, y;
- for (y = y0; y < y0 + 8; y++) {
- for (x = x0; x < x0 + 8; x++) {
+ for (y = y0; y < y0 + size; y++) {
+ for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -66,21 +67,94 @@
}
}
+#if CONFIG_AOM_HIGHBITDEPTH
+// Identical to aom_clpf_detect_c() apart from "rec" and "org".
+void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
+ int rstride, int ostride, int x0, int y0, int width,
+ int height, int *sum0, int *sum1,
+ unsigned int strength, int shift, int size) {
+ int x, y;
+ for (y = y0; y < y0 + size; y++) {
+ for (x = x0; x < x0 + size; x++) {
+ int O = org[y * ostride + x] >> shift;
+ int X = rec[y * rstride + x] >> shift;
+ int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
+ int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
+ int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
+ int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
+ int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
+ int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
+ int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength >> shift);
+ int Y = X + delta;
+ *sum0 += (O - X) * (O - X);
+ *sum1 += (O - Y) * (O - Y);
+ }
+ }
+}
+
+// Identical to aom_clpf_detect_multi_c() apart from "rec" and "org".
+void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
+ int rstride, int ostride, int x0, int y0,
+ int width, int height, int *sum, int shift,
+ int size) {
+ int x, y;
+
+ for (y = y0; y < y0 + size; y++) {
+ for (x = x0; x < x0 + size; x++) {
+ int O = org[y * ostride + x] >> shift;
+ int X = rec[y * rstride + x] >> shift;
+ int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
+ int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
+ int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
+ int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
+ int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
+ int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
+ int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1);
+ int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2);
+ int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4);
+ int F1 = X + delta1;
+ int F2 = X + delta2;
+ int F3 = X + delta3;
+ sum[0] += (O - X) * (O - X);
+ sum[1] += (O - F1) * (O - F1);
+ sum[2] += (O - F2) * (O - F2);
+ sum[3] += (O - F3) * (O - F3);
+ }
+ }
+}
+#endif
+
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, uint8_t *res) {
int m, n, sum0 = 0, sum1 = 0;
+
for (m = 0; m < h; m++) {
for (n = 0; n < w; n++) {
int xpos = (l << fb_size_log2) + n * block_size;
int ypos = (k << fb_size_log2) + m * block_size;
- const int bs = MAX_MIB_SIZE;
- if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
- ->mbmi.skip)
+ if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+ ->mbmi.skip) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
+ CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
+ org->y_stride, xpos, ypos, rec->y_crop_width,
+ rec->y_crop_height, &sum0, &sum1, strength,
+ cm->bit_depth - 8, block_size);
+ } else {
+ aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
+ org->y_stride, xpos, ypos, rec->y_crop_width,
+ rec->y_crop_height, &sum0, &sum1, strength,
+ block_size);
+ }
+#else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
- rec->y_crop_height, &sum0, &sum1, strength);
+ rec->y_crop_height, &sum0, &sum1, strength, block_size);
+#endif
+ }
}
}
*res = sum1 < sum0;
@@ -90,6 +164,7 @@
// Calculate the square error of all filter settings. Result:
// res[0][0] : unfiltered
// res[0][1-3] : strength=1,2,4, no signals
+// (Only for luma:)
// res[1][0] : (bit count, fb size = 128)
// res[1][1-3] : strength=1,2,4, fb size = 128
// res[2][0] : (bit count, fb size = 64)
@@ -99,12 +174,28 @@
static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
unsigned int block_size, unsigned int fb_size_log2, int w,
- int h, int64_t res[4][4]) {
+ int h, int64_t res[4][4], int plane) {
int c, m, n, filtered = 0;
int sum[4];
+ const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
+ const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
int bslog = get_msb(block_size);
+ uint8_t *rec_buffer =
+ plane != AOM_PLANE_Y
+ ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
+ : rec->y_buffer;
+ uint8_t *org_buffer =
+ plane != AOM_PLANE_Y
+ ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
+ : org->y_buffer;
+ int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+ int rec_height =
+ plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+ int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
+ int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
sum[0] = sum[1] = sum[2] = sum[3] = 0;
- if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
+ if (plane == AOM_PLANE_Y &&
+ fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
fb_size_log2--;
@@ -119,16 +210,17 @@
oldfiltered = res[i][0];
res[i][0] = 0;
- filtered =
- clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res);
+ filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+ res, plane);
if (1 << (fb_size_log2 - bslog) < w)
filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
- fb_size_log2, w2, h1, res);
+ fb_size_log2, w2, h1, res, plane);
if (1 << (fb_size_log2 - bslog) < h) {
filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
- fb_size_log2, w1, h2, res);
- filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2),
- rec, org, cm, block_size, fb_size_log2, w2, h2, res);
+ fb_size_log2, w1, h2, res, plane);
+ filtered |=
+ clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
+ cm, block_size, fb_size_log2, w2, h2, res, plane);
}
res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
@@ -142,18 +234,31 @@
for (n = 0; n < w; n++) {
int xpos = x + n * block_size;
int ypos = y + m * block_size;
- if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride +
- xpos / MAX_MIB_SIZE]
+ if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+ (xpos << subx) / MI_SIZE]
->mbmi.skip) {
- aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
- org->y_stride, xpos, ypos, rec->y_crop_width,
- rec->y_crop_height, sum);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ aom_clpf_detect_multi_hbd(
+ CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+ rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
+ cm->bit_depth - 8, block_size);
+ } else {
+ aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+ xpos, ypos, rec_width, rec_height, sum,
+ block_size);
+ }
+#else
+ aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+ xpos, ypos, rec_width, rec_height, sum,
+ block_size);
+#endif
filtered = 1;
}
}
}
- for (c = 0; c < 4; c++) {
+ for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
res[c][0] += sum[0];
res[c][1] += sum[1];
res[c][2] += sum[2];
@@ -164,59 +269,69 @@
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
- int *best_strength, int *best_bs) {
+ int *best_strength, int *best_bs, int plane) {
int c, j, k, l;
int64_t best, sums[4][4];
- int width = rec->y_crop_width, height = rec->y_crop_height;
- const int bs = MAX_MIB_SIZE;
+ int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+ int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+ const int bs = MI_SIZE;
+ const int bslog = get_msb(bs);
int fb_size_log2 = get_msb(MAX_FB_SIZE);
int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
memset(sums, 0, sizeof(sums));
- for (k = 0; k < num_fb_ver; k++) {
- for (l = 0; l < num_fb_hor; l++) {
- // Calculate the block size after frame border clipping
- int h =
- AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
- int w =
- AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
- h += !h << fb_size_log2;
- w += !w << fb_size_log2;
- clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs,
- fb_size_log2, w / bs, h / bs, sums);
+ if (plane != AOM_PLANE_Y)
+      // Use a block size of MI_SIZE regardless of the subsampling.
+      // This is accurate enough to determine the best strength and
+ // we don't need to add SIMD optimisations for 4x4 blocks.
+ clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
+ height >> bslog, sums, plane);
+ else
+ for (k = 0; k < num_fb_ver; k++) {
+ for (l = 0; l < num_fb_hor; l++) {
+ // Calculate the block size after frame border clipping
+ int h =
+ AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+ int w =
+ AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+ h += !h << fb_size_log2;
+ w += !w << fb_size_log2;
+ clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
+ fb_size_log2, w >> bslog, h >> bslog, sums, plane);
+ }
}
- }
+
+ if (plane != AOM_PLANE_Y) // Slightly favour unfiltered chroma
+ sums[0][0] -= sums[0][0] >> 7;
for (j = 0; j < 4; j++) {
static const double lambda_square[] = {
- // exp((i - 15.4244) / 8.4010)
- 0.159451, 0.179607, 0.202310, 0.227884, 0.256690, 0.289138, 0.325687,
- 0.366856, 0.413230, 0.465465, 0.524303, 0.590579, 0.665233, 0.749323,
- 0.844044, 0.950737, 1.070917, 1.206289, 1.358774, 1.530533, 1.724004,
- 1.941931, 2.187406, 2.463911, 2.775368, 3.126195, 3.521370, 3.966498,
- 4.467893, 5.032669, 5.668837, 6.385421, 7.192586, 8.101784, 9.125911,
- 10.27949, 11.57890, 13.04256, 14.69124, 16.54832, 18.64016, 20.99641,
- 23.65052, 26.64013, 30.00764, 33.80084, 38.07352, 42.88630, 48.30746,
- 54.41389, 61.29221, 69.04002, 77.76720, 87.59756, 98.67056, 111.1432,
- 125.1926, 141.0179, 158.8436, 178.9227, 201.5399, 227.0160, 255.7126,
- 288.0366
+ // exp(x / 8.5)
+ 1.0000, 1.1248, 1.2653, 1.4232, 1.6009, 1.8008, 2.0256, 2.2785,
+ 2.5630, 2.8830, 3.2429, 3.6478, 4.1032, 4.6155, 5.1917, 5.8399,
+ 6.5689, 7.3891, 8.3116, 9.3492, 10.516, 11.829, 13.306, 14.967,
+ 16.836, 18.938, 21.302, 23.962, 26.953, 30.318, 34.103, 38.361,
+ 43.151, 48.538, 54.598, 61.414, 69.082, 77.706, 87.408, 98.320,
+ 110.59, 124.40, 139.93, 157.40, 177.05, 199.16, 224.02, 251.99,
+ 283.45, 318.84, 358.65, 403.42, 453.79, 510.45, 574.17, 645.86,
+ 726.49, 817.19, 919.22, 1033.9, 1163.0, 1308.2, 1471.6, 1655.3
};
// Estimate the bit costs and adjust the square errors
double lambda =
lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
- int i, cost = (int)((1.2 * lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5));
+ int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
for (i = 0; i < 4; i++)
sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
}
best = (int64_t)1 << 62;
- for (c = 0; c < 4; c++)
+ for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
for (j = 0; j < 4; j++)
if ((!c || j) && sums[c][j] < best) best = sums[c][j];
best &= 15;
- *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
+ if (best_bs) *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
*best_strength = best ? 1 << ((best - 1) & 3) : 0;
}
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index 3dd5478..bb85fbc 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -21,6 +21,6 @@
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
- int *best_strength, int *best_bs);
+ int *best_strength, int *best_bs, int plane);
#endif
diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h
index abbbe7c..7c07329 100644
--- a/av1/encoder/clpf_rdo_simd.h
+++ b/av1/encoder/clpf_rdo_simd.h
@@ -9,496 +9,278 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h"
+#include "aom_ports/mem.h"
+
+SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+ v128 *f) {
+ // The difference will be 9 bit, offset by 128 so we can use saturated
+ // sub to avoid going to 16 bit temporarily before "strength" clipping.
+ const v128 c128 = v128_dup_8(128);
+ v128 x = v128_add_8(c128, o);
+ *a = v128_ssub_s8(v128_add_8(c128, *a), x);
+ *b = v128_ssub_s8(v128_add_8(c128, *b), x);
+ *c = v128_ssub_s8(v128_add_8(c128, *c), x);
+ *d = v128_ssub_s8(v128_add_8(c128, *d), x);
+ *e = v128_ssub_s8(v128_add_8(c128, *e), x);
+ *f = v128_ssub_s8(v128_add_8(c128, *f), x);
+}
+
+SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 sp, v128 sm) {
+ const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
+ v128_max_s8(v128_min_s8(d, sp), sm));
+ const v128 delta = v128_add_8(
+ v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
+ v128_max_s8(v128_min_s8(f, sp), sm)),
+ 2),
+ v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
+ v128_max_s8(v128_min_s8(e, sp), sm))),
+ v128_add_8(v128_add_8(tmp, tmp), tmp));
+
+ return v128_add_8(
+ o, v128_shr_s8(
+ v128_add_8(v128_dup_8(8),
+ v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+ 4));
+}
+
+SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 sp, v128 sm) {
+ calc_diff(o, &a, &b, &c, &d, &e, &f);
+ return delta_kernel(o, a, b, c, d, e, f, sp, sm);
+}
+
+SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
+ int right) {
+ DECLARE_ALIGNED(16, static const uint64_t,
+ b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
+ DECLARE_ALIGNED(16, static const uint64_t,
+ e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
+
+ if (!left) { // Left clipping
+ *b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
+ *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
+ }
+ if (!right) { // Right clipping
+ *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
+ *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
+ }
+}
+
+SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
+ int rstride, int ostride, int x0, int y0,
+ int bottom, int right, int y, v128 *o, v128 *r,
+ v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+ v128 *f) {
+ const v64 k1 = v64_load_aligned(org);
+ const v64 k2 = v64_load_aligned(org + ostride);
+ const v64 l1 = v64_load_aligned(rec);
+ const v64 l2 = v64_load_aligned(rec + rstride);
+ *o = v128_from_v64(k1, k2);
+ *r = v128_from_v64(l1, l2);
+ *a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
+ *f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
+ *b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
+ v64_load_unaligned(rec - 2 * !!x0 + rstride));
+ *c = v128_from_v64(v64_load_unaligned(rec - !!x0),
+ v64_load_unaligned(rec - !!x0 + rstride));
+ *d = v128_from_v64(v64_load_unaligned(rec + !!right),
+ v64_load_unaligned(rec + !!right + rstride));
+ *e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
+ v64_load_unaligned(rec + 2 * !!right + rstride));
+ clip_sides(b, c, d, e, x0, right);
+}
void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1,
- unsigned int strength) {
- ssd128_internal ssd0 = v128_ssd_u8_init();
- ssd128_internal ssd1 = v128_ssd_u8_init();
- const v128 c128 = v128_dup_8(128);
+ unsigned int strength, int size) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
+ const int right = width - 8 - x0;
const int bottom = height - 2 - y0;
+ ssd128_internal ssd0 = v128_ssd_u8_init();
+ ssd128_internal ssd1 = v128_ssd_u8_init();
+ int y;
+
+ if (size != 8) { // Fallback to plain C
+ aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
+ sum1, strength, size);
+ return;
+ }
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
- if (!x0) { // Clip left
- const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
- v64_from_64(0x0504030201000000LL));
- const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
- v64_from_64(0x0605040302010000LL));
- int y;
-
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- const v128 b = v128_shuffle_8(x, b_shuff);
- const v128 c = v128_shuffle_8(x, c_shuff);
- const v128 d = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec + 1),
- v64_load_unaligned(rec + 1 + rstride)));
- const v128 e = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec + 2),
- v64_load_unaligned(rec + 2 + rstride)));
- const v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
-
- delta = v128_shr_s8(
- v128_add_8(v128_dup_8(8),
- v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
- 4);
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
- rec += rstride * 2;
- org += ostride * 2;
- }
- } else if (!(width - x0 - 8)) { // Clip right
- const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
- v64_from_64(0x0707060504030201LL));
- const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
- v64_from_64(0x0707070605040302LL));
- int y;
-
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- const v128 b = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec - 2),
- v64_load_unaligned(rec - 2 + rstride)));
- const v128 c = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec - 1),
- v64_load_unaligned(rec - 1 + rstride)));
- const v128 d = v128_shuffle_8(x, d_shuff);
- const v128 e = v128_shuffle_8(x, e_shuff);
- const v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- delta = v128_shr_s8(
- v128_add_8(v128_dup_8(8),
- v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
- 4);
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
- rec += rstride * 2;
- org += ostride * 2;
- }
- } else { // No left/right clipping
- int y;
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- const v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- const v128 b = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec - 2),
- v64_load_unaligned(rec - 2 + rstride)));
- const v128 c = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec - 1),
- v64_load_unaligned(rec - 1 + rstride)));
- const v128 d = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec + 1),
- v64_load_unaligned(rec + 1 + rstride)));
- const v128 e = v128_add_8(
- c128, v128_from_v64(v64_load_unaligned(rec + 2),
- v64_load_unaligned(rec + 2 + rstride)));
- const v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
- const v128 tmp =
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
- v128 delta = v128_add_8(
- v128_add_8(
- v128_shl_8(
- v128_add_8(
- v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
- v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- delta = v128_shr_s8(
- v128_add_8(v128_dup_8(8),
- v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
- 4);
-
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
- rec += rstride * 2;
- org += ostride * 2;
- }
+ for (y = 0; y < 8; y += 2) {
+ v128 a, b, c, d, e, f, o, r;
+ read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+ &a, &b, &c, &d, &e, &f);
+ ssd0 = v128_ssd_u8(ssd0, o, r);
+ ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+ rec += rstride * 2;
+ org += ostride * 2;
}
*sum0 += v128_ssd_u8_sum(ssd0);
*sum1 += v128_ssd_u8_sum(ssd1);
}
-// Test multiple filter strengths at once. Use a simpler filter (4 tap, every
-// second line).
+SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
+ v128 d, v128 e, v128 f, ssd128_internal *ssd1,
+ ssd128_internal *ssd2,
+ ssd128_internal *ssd3) {
+ calc_diff(r, &a, &b, &c, &d, &e, &f);
+ *ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
+ v128_dup_8(-1)));
+ *ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
+ v128_dup_8(-2)));
+ *ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
+ v128_dup_8(-4)));
+}
+
+// Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
- int width, int height, int *sum) {
- const v128 c128 = v128_dup_8(128);
- const v128 cp1 = v128_dup_8(1);
- const v128 cm1 = v128_dup_8(-1);
- const v128 cp2 = v128_dup_8(2);
- const v128 cm2 = v128_dup_8(-2);
- const v128 cp4 = v128_dup_8(4);
- const v128 cm4 = v128_dup_8(-4);
- const v128 c8 = v128_dup_8(8);
+ int width, int height, int *sum,
+ int size) {
const int bottom = height - 2 - y0;
+ const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
ssd128_internal ssd2 = v128_ssd_u8_init();
ssd128_internal ssd3 = v128_ssd_u8_init();
+ int y;
+
+ if (size != 8) { // Fallback to plain C
+ aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
+ sum, size);
+ return;
+ }
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
- if (!x0) { // Clip left
- const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
- v64_from_64(0x0504030201000000LL));
- const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
- v64_from_64(0x0605040302010000LL));
- int y;
-
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- v128 b = v128_shuffle_8(x, b_shuff);
- v128 c = v128_shuffle_8(x, c_shuff);
- v128 d = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec + 1),
- v64_load_unaligned(rec + 1 + rstride)));
- v128 e = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec + 2),
- v64_load_unaligned(rec + 2 + rstride)));
- v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
- v128 tmp, delta1, delta2, delta3;
-
- a = v128_ssub_s8(a, x);
- b = v128_ssub_s8(b, x);
- c = v128_ssub_s8(c, x);
- d = v128_ssub_s8(d, x);
- e = v128_ssub_s8(e, x);
- f = v128_ssub_s8(f, x);
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
- v128_max_s8(v128_min_s8(d, cp1), cm1));
- delta1 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
- v128_max_s8(v128_min_s8(f, cp1), cm1)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
- v128_max_s8(v128_min_s8(e, cp1), cm1))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
- v128_max_s8(v128_min_s8(d, cp2), cm2));
- delta2 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
- v128_max_s8(v128_min_s8(f, cp2), cm2)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
- v128_max_s8(v128_min_s8(e, cp2), cm2))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
- v128_max_s8(v128_min_s8(d, cp4), cm4));
- delta3 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
- v128_max_s8(v128_min_s8(f, cp4), cm4)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
- v128_max_s8(v128_min_s8(e, cp4), cm4))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
-
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(
- ssd1, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
- delta1, v128_zero()))),
- 4)));
- ssd2 = v128_ssd_u8(
- ssd2, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
- delta2, v128_zero()))),
- 4)));
- ssd3 = v128_ssd_u8(
- ssd3, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
- delta3, v128_zero()))),
- 4)));
- rec += 2 * rstride;
- org += 2 * ostride;
- }
- } else if (!(width - x0 - 8)) { // Clip right
- const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
- v64_from_64(0x0707060504030201LL));
- const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
- v64_from_64(0x0707070605040302LL));
- int y;
-
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- v128 b = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec - 2),
- v64_load_unaligned(rec - 2 + rstride)));
- v128 c = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec - 1),
- v64_load_unaligned(rec - 1 + rstride)));
- v128 d = v128_shuffle_8(x, d_shuff);
- v128 e = v128_shuffle_8(x, e_shuff);
- v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
- v128 tmp, delta1, delta2, delta3;
-
- a = v128_ssub_s8(a, x);
- b = v128_ssub_s8(b, x);
- c = v128_ssub_s8(c, x);
- d = v128_ssub_s8(d, x);
- e = v128_ssub_s8(e, x);
- f = v128_ssub_s8(f, x);
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
- v128_max_s8(v128_min_s8(d, cp1), cm1));
- delta1 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
- v128_max_s8(v128_min_s8(f, cp1), cm1)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
- v128_max_s8(v128_min_s8(e, cp1), cm1))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
- v128_max_s8(v128_min_s8(d, cp2), cm2));
- delta2 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
- v128_max_s8(v128_min_s8(f, cp2), cm2)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
- v128_max_s8(v128_min_s8(e, cp2), cm2))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
- v128_max_s8(v128_min_s8(d, cp4), cm4));
- delta3 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
- v128_max_s8(v128_min_s8(f, cp4), cm4)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
- v128_max_s8(v128_min_s8(e, cp4), cm4))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
-
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(
- ssd1, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
- delta1, v128_zero()))),
- 4)));
- ssd2 = v128_ssd_u8(
- ssd2, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
- delta2, v128_zero()))),
- 4)));
- ssd3 = v128_ssd_u8(
- ssd3, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
- delta3, v128_zero()))),
- 4)));
- rec += 2 * rstride;
- org += 2 * ostride;
- }
- } else { // No left/right clipping
- int y;
- for (y = 0; y < 8; y += 2) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- v128 o = v128_from_v64(k1, k2);
- const v128 q = v128_from_v64(l1, l2);
- const v128 x = v128_add_8(c128, q);
- v128 a = v128_add_8(
- c128,
- v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
- v128 b = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec - 2),
- v64_load_unaligned(rec - 2 + rstride)));
- v128 c = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec - 1),
- v64_load_unaligned(rec - 1 + rstride)));
- v128 d = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec + 1),
- v64_load_unaligned(rec + 1 + rstride)));
- v128 e = v128_add_8(c128,
- v128_from_v64(v64_load_unaligned(rec + 2),
- v64_load_unaligned(rec + 2 + rstride)));
- v128 f = v128_add_8(
- c128, v128_from_v64(
- l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
- v128 tmp, delta1, delta2, delta3;
-
- a = v128_ssub_s8(a, x);
- b = v128_ssub_s8(b, x);
- c = v128_ssub_s8(c, x);
- d = v128_ssub_s8(d, x);
- e = v128_ssub_s8(e, x);
- f = v128_ssub_s8(f, x);
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
- v128_max_s8(v128_min_s8(d, cp1), cm1));
- delta1 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
- v128_max_s8(v128_min_s8(f, cp1), cm1)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
- v128_max_s8(v128_min_s8(e, cp1), cm1))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
- v128_max_s8(v128_min_s8(d, cp2), cm2));
- delta2 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
- v128_max_s8(v128_min_s8(f, cp2), cm2)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
- v128_max_s8(v128_min_s8(e, cp2), cm2))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
- tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
- v128_max_s8(v128_min_s8(d, cp4), cm4));
- delta3 = v128_add_8(
- v128_add_8(
- v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
- v128_max_s8(v128_min_s8(f, cp4), cm4)),
- 2),
- v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
- v128_max_s8(v128_min_s8(e, cp4), cm4))),
- v128_add_8(v128_add_8(tmp, tmp), tmp));
-
- ssd0 = v128_ssd_u8(ssd0, o, q);
- ssd1 = v128_ssd_u8(
- ssd1, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
- delta1, v128_zero()))),
- 4)));
- ssd2 = v128_ssd_u8(
- ssd2, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
- delta2, v128_zero()))),
- 4)));
- ssd3 = v128_ssd_u8(
- ssd3, o,
- v128_add_8(
- q,
- v128_shr_s8(
- v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
- delta3, v128_zero()))),
- 4)));
- rec += 2 * rstride;
- org += 2 * ostride;
- }
+ for (y = 0; y < 8; y += 2) {
+ v128 a, b, c, d, e, f, o, r;
+ read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+ &a, &b, &c, &d, &e, &f);
+ ssd0 = v128_ssd_u8(ssd0, o, r);
+ calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+ rec += 2 * rstride;
+ org += 2 * ostride;
}
sum[0] += v128_ssd_u8_sum(ssd0);
sum[1] += v128_ssd_u8_sum(ssd1);
sum[2] += v128_ssd_u8_sum(ssd2);
sum[3] += v128_ssd_u8_sum(ssd3);
}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
+ int rstride, int ostride, int x0, int y0,
+ int bottom, int right, int y, v128 *o,
+ v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
+ v128 *e, v128 *f, int shift) {
+ const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
+ const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
+ *o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
+ v128_shr_u16(v128_load_aligned(org + ostride), shift));
+ *r = v128_unziplo_8(n1, n2);
+ *a = v128_unziplo_8(
+ v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
+ *f = v128_unziplo_8(
+ n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
+ shift));
+ *b = v128_unziplo_8(
+ v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
+ v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
+ *c = v128_unziplo_8(
+ v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
+ v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
+ *d = v128_unziplo_8(
+ v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
+ v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
+ *e = v128_unziplo_8(
+ v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
+ v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
+ clip_sides(b, c, d, e, x0, right);
+}
+
+void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
+ int rstride, int ostride, int x0, int y0,
+ int width, int height, int *sum0, int *sum1,
+ unsigned int strength, int shift,
+ int size) {
+ const v128 sp = v128_dup_8(strength >> shift);
+ const v128 sm = v128_dup_8(-(int)(strength >> shift));
+ const int bottom = height - 2 - y0;
+ const int right = width - 8 - x0;
+ ssd128_internal ssd0 = v128_ssd_u8_init();
+ ssd128_internal ssd1 = v128_ssd_u8_init();
+ int y;
+
+ if (size != 8) { // Fallback to plain C
+ aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
+ sum0, sum1, strength, shift, size);
+ return;
+ }
+
+ rec += x0 + y0 * rstride;
+ org += x0 + y0 * ostride;
+
+ for (y = 0; y < 8; y += 2) {
+ v128 a, b, c, d, e, f, o, r;
+ read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+ &r, &a, &b, &c, &d, &e, &f, shift);
+ ssd0 = v128_ssd_u8(ssd0, o, r);
+ ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+ rec += rstride * 2;
+ org += ostride * 2;
+ }
+ *sum0 += v128_ssd_u8_sum(ssd0);
+ *sum1 += v128_ssd_u8_sum(ssd1);
+}
+
+void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
+ const uint16_t *org, int rstride,
+ int ostride, int x0, int y0,
+ int width, int height, int *sum,
+ int shift, int size) {
+ const int bottom = height - 2 - y0;
+ const int right = width - 8 - x0;
+ ssd128_internal ssd0 = v128_ssd_u8_init();
+ ssd128_internal ssd1 = v128_ssd_u8_init();
+ ssd128_internal ssd2 = v128_ssd_u8_init();
+ ssd128_internal ssd3 = v128_ssd_u8_init();
+ int y;
+
+ if (size != 8) { // Fallback to plain C
+ aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
+ height, sum, shift, size);
+ return;
+ }
+
+ rec += x0 + y0 * rstride;
+ org += x0 + y0 * ostride;
+
+ for (y = 0; y < 8; y += 2) {
+ v128 a, b, c, d, e, f, o, r;
+ read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+ &r, &a, &b, &c, &d, &e, &f, shift);
+ ssd0 = v128_ssd_u8(ssd0, o, r);
+ calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+ rec += 2 * rstride;
+ org += 2 * ostride;
+ }
+ sum[0] += v128_ssd_u8_sum(ssd0);
+ sum[1] += v128_ssd_u8_sum(ssd1);
+ sum[2] += v128_ssd_u8_sum(ssd2);
+ sum[3] += v128_ssd_u8_sum(ssd3);
+}
+#endif
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 87e7d51..2eecee4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -815,7 +815,9 @@
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+#if CONFIG_DUAL_FILTER
int i;
+#endif
const uint8_t *src;
const uint8_t *ref;
int src_stride;
@@ -859,7 +861,6 @@
if (!is_key_frame) {
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- unsigned int uv_sad;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
@@ -916,20 +917,6 @@
av1_build_inter_predictors_sb(xd, mi_row, mi_col, cm->sb_size);
- for (i = 1; i < MAX_MB_PLANE; ++i) {
- struct macroblock_plane *p = &x->plane[i];
- struct macroblockd_plane *pd = &xd->plane[i];
- const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-
- if (bs == BLOCK_INVALID)
- uv_sad = UINT_MAX;
- else
- uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
- pd->dst.stride);
-
- x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
- }
-
ref = xd->plane[0].dst.buf;
ref_stride = xd->plane[0].dst.stride;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 69a7cf8..983f8cc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -16,6 +16,7 @@
#include "av1/common/alloccommon.h"
#if CONFIG_CLPF
+#include "aom/aom_image.h"
#include "av1/common/clpf.h"
#include "av1/encoder/clpf_rdo.h"
#endif
@@ -3423,7 +3424,7 @@
#endif
}
#if CONFIG_CLPF
- cm->clpf_strength = 0;
+ cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
cm->clpf_size = 2;
CHECK_MEM_ERROR(
cm, cm->clpf_blocks,
@@ -3431,35 +3432,33 @@
((cm->frame_to_show->y_crop_height + 31) & ~31) >>
10));
if (!is_lossless_requested(&cpi->oxcf)) {
- // Test CLPF
- int i, hq = 1;
- // TODO(yaowu): investigate per-segment CLPF decision and
- // an optimal threshold, use 80 for now.
- for (i = 0; i < MAX_SEGMENTS; i++)
- hq &= av1_get_qindex(&cm->seg, i, cm->base_qindex) < 80;
+ const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
- // Don't try filter if the entire image is nearly losslessly encoded
- if (!hq) {
- // Find the best strength and block size for the entire frame
- int fb_size_log2, strength;
- av1_clpf_test_frame(&cpi->last_frame_uf, cpi->Source, cm, &strength,
- &fb_size_log2);
+ // Find the best strength and block size for the entire frame
+ int fb_size_log2, strength_y, strength_u, strength_v;
+ av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
+ AOM_PLANE_Y);
+ av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, 0, AOM_PLANE_U);
+ av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, 0, AOM_PLANE_V);
- if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
-
- if (!strength) { // Better to disable for the whole frame?
- cm->clpf_strength = 0;
- } else {
- // Apply the filter using the chosen strength
- cm->clpf_strength = strength - (strength == 4);
- cm->clpf_size =
- fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
- aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
- cm->clpf_numblocks =
- av1_clpf_frame(cm->frame_to_show, &cpi->last_frame_uf, cpi->Source,
- cm, !!cm->clpf_size, strength, 4 + cm->clpf_size,
- cm->clpf_blocks, av1_clpf_decision);
- }
+ if (strength_y) {
+ // Apply the filter using the chosen strength
+ cm->clpf_strength_y = strength_y - (strength_y == 4);
+ cm->clpf_size =
+ fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
+ cm->clpf_numblocks = av1_clpf_frame(
+ frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
+ 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+ }
+ if (strength_u) {
+ cm->clpf_strength_u = strength_u - (strength_u == 4);
+ av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
+ NULL);
+ }
+ if (strength_v) {
+ cm->clpf_strength_v = strength_v - (strength_v == 4);
+ av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
+ NULL);
}
}
#endif
diff --git a/av1/encoder/treewriter.h b/av1/encoder/treewriter.h
index 533e7d9..9a4cb86 100644
--- a/av1/encoder/treewriter.h
+++ b/av1/encoder/treewriter.h
@@ -29,20 +29,10 @@
void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *);
-static INLINE void av1_write_tree(aom_writer *w, const aom_tree_index *tree,
- const aom_prob *probs, int bits, int len,
- aom_tree_index i) {
- do {
- const int bit = (bits >> --len) & 1;
- aom_write(w, bit, probs[i >> 1]);
- i = tree[i + bit];
- } while (len);
-}
-
static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree,
const aom_prob *probs,
const struct av1_token *token) {
- av1_write_tree(w, tree, probs, token->value, token->len, 0);
+ aom_write_tree(w, tree, probs, token->value, token->len, 0);
}
#ifdef __cplusplus
diff --git a/configure b/configure
index c96691ba..1bc0863 100755
--- a/configure
+++ b/configure
@@ -606,12 +606,7 @@
check_add_cflags -Wuninitialized
check_add_cflags -Wunused-variable
case ${CC} in
- *clang*)
- # libaom and/or clang have issues with aliasing:
- # https://code.google.com/p/webm/issues/detail?id=603
- # work around them until they are fixed
- check_add_cflags -fno-strict-aliasing
- ;;
+ *clang*) ;;
*) check_add_cflags -Wunused-but-set-variable ;;
esac
if enabled mips || [ -z "${INLINE}" ]; then
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 7676226..069e35e 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -1,11 +1,12 @@
/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <stdio.h>
diff --git a/test/clpf_test.cc b/test/clpf_test.cc
index 786180b..24d7bb3 100644
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -26,9 +26,9 @@
namespace {
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
- int x0, int y0, int sizex, int sizey, int width,
- int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+ int dstride, int x0, int y0, int sizex, int sizey,
+ int width, int height, unsigned int strength);
typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
clpf_block_param_t;
@@ -54,44 +54,84 @@
typedef ClpfBlockTest ClpfSpeedTest;
-TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
- int w = sizex;
- int h = sizey;
- const int size = 32;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint8_t, s[size * size]);
- DECLARE_ALIGNED(16, uint8_t, d[size * size]);
- DECLARE_ALIGNED(16, uint8_t, ref_d[size * size]);
- memset(ref_d, 0, size * size);
- memset(d, 0, size * size);
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst,
+ int sstride, int dstride, int x0, int y0,
+ int sizex, int sizey, int width, int height,
+ unsigned int strength);
- int error = 0;
- int pos = 0;
- int strength = 0;
- int xpos = 0, ypos = 0;
- int bits;
- int level;
+typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
+ clpf_block_hbd_param_t;
+
+class ClpfBlockHbdTest
+ : public ::testing::TestWithParam<clpf_block_hbd_param_t> {
+ public:
+ virtual ~ClpfBlockHbdTest() {}
+ virtual void SetUp() {
+ clpf = GET_PARAM(0);
+ ref_clpf = GET_PARAM(1);
+ sizex = GET_PARAM(2);
+ sizey = GET_PARAM(3);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ int sizex;
+ int sizey;
+ clpf_block_hbd_t clpf;
+ clpf_block_hbd_t ref_clpf;
+};
+
+typedef ClpfBlockHbdTest ClpfHbdSpeedTest;
+#endif
+
+template <typename pixel>
+void test_clpf(int w, int h, int depth, int iterations,
+ void (*clpf)(const pixel *src, pixel *dst, int sstride,
+ int dstride, int x0, int y0, int sizex, int sizey,
+ int width, int height, unsigned int strength),
+ void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
+ int dstride, int x0, int y0, int sizex,
+ int sizey, int width, int height,
+ unsigned int strength)) {
+ const int size = 24;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, pixel, s[size * size]);
+ DECLARE_ALIGNED(16, pixel, d[size * size]);
+ DECLARE_ALIGNED(16, pixel, ref_d[size * size]);
+ memset(ref_d, 0, size * size * sizeof(*ref_d));
+ memset(d, 0, size * size * sizeof(*d));
+
+ int error = 0, pos = 0, strength = 0, xpos = 0, ypos = 0;
+ int bits, level, count;
// Test every combination of:
- // * Input with 1-8 bits of noise
- // * Noise level around every value from 0 to 255
+ // * Input with up to <depth> bits of noise
+ // * Noise level around every value from 0 to (1<<depth)-1
// * Blocks anywhere in the frame (along all egdes and also fully inside)
// * All strengths
- for (level = 0; level < 256 && !error; level++) {
- for (bits = 1; bits < 9 && !error; bits++) {
- for (int i = 0; i < size * size; i++)
- s[i] = clamp((rnd.Rand8() & ((1 << bits) - 1)) + level, 0, 255);
+ // If clpf and ref_clpf are the same, we're just testing speed
+ for (count = 0; count < iterations; count++) {
+ for (level = 0; level < (1 << depth) && !error; level++) {
+ for (bits = 1; bits <= depth && !error; bits++) {
+ for (int i = 0; i < size * size; i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
- for (ypos = 0; ypos < size && !error; ypos += h * !error) {
- for (xpos = 0; xpos < size && !error; xpos += w * !error) {
- for (strength = 0; strength < 3 && !error; strength += !error) {
- ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
- 1 << strength);
- ASM_REGISTER_STATE_CHECK(
- clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
-
- for (pos = 0; pos < size * size && !error; pos++) {
- error = ref_d[pos] != d[pos];
+ for (ypos = 0; ypos < size && !error; ypos += h * !error) {
+ for (xpos = 0; xpos < size && !error; xpos += w * !error) {
+ for (strength = depth - 8; strength < depth - 5 && !error;
+ strength += !error) {
+ ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
+ 1 << strength);
+ if (clpf != ref_clpf)
+ ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w,
+ h, size, size, 1 << strength));
+ if (ref_clpf != clpf)
+ for (pos = 0; pos < size * size && !error; pos++) {
+ error = ref_d[pos] != d[pos];
+ }
}
}
}
@@ -99,6 +139,7 @@
}
}
+ pos--;
EXPECT_EQ(0, error)
<< "Error: ClpfBlockTest, SIMD and C mismatch." << std::endl
<< "First error at " << pos % size << "," << pos / size << " ("
@@ -106,6 +147,8 @@
<< "strength: " << (1 << strength) << std::endl
<< "xpos: " << xpos << std::endl
<< "ypos: " << ypos << std::endl
+ << "w: " << w << std::endl
+ << "h: " << h << std::endl
<< "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
<< "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
<< "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
@@ -116,45 +159,26 @@
<< std::endl;
}
-TEST_P(ClpfSpeedTest, TestSpeed) {
- int w = sizex;
- int h = sizey;
- const int size = 32;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint8_t, s[size * size]);
- DECLARE_ALIGNED(16, uint8_t, d[size * size]);
-
- int strength;
- int xpos, ypos;
-
- for (int i = 0; i < size * size; i++) s[i] = rnd.Rand8();
-
+template <typename pixel>
+void test_clpf_speed(int w, int h, int depth, int iterations,
+ void (*clpf)(const pixel *src, pixel *dst, int sstride,
+ int dstride, int x0, int y0, int sizex,
+ int sizey, int width, int height,
+ unsigned int strength),
+ void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
+ int dstride, int x0, int y0, int sizex,
+ int sizey, int width, int height,
+ unsigned int strength)) {
aom_usec_timer ref_timer;
aom_usec_timer timer;
aom_usec_timer_start(&ref_timer);
- for (int c = 0; c < 65536; c++) {
- for (ypos = 0; ypos < size; ypos += h) {
- for (xpos = 0; xpos < size; xpos += w) {
- for (strength = 0; strength < 3; strength++) {
- ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
- }
- }
- }
- }
+ test_clpf(w, h, depth, iterations, ref_clpf, ref_clpf);
aom_usec_timer_mark(&ref_timer);
int ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer_start(&timer);
- for (int c = 0; c < 65536; c++) {
- for (ypos = 0; ypos < size; ypos += h) {
- for (xpos = 0; xpos < size; xpos += w) {
- for (strength = 0; strength < 3; strength++) {
- clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
- }
- }
- }
- }
+ test_clpf(w, h, depth, iterations, clpf, clpf);
aom_usec_timer_mark(&timer);
int elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -165,10 +189,28 @@
EXPECT_GT(ref_elapsed_time, elapsed_time)
<< "Error: ClpfSpeedTest, SIMD slower than C." << std::endl
- << "C time: " << ref_elapsed_time << "ms" << std::endl
- << "SIMD time: " << elapsed_time << "ms" << std::endl;
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
}
+TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
+ test_clpf(sizex, sizey, 8, 1, clpf, ref_clpf);
+}
+
+TEST_P(ClpfSpeedTest, TestSpeed) {
+ test_clpf_speed(sizex, sizey, 8, 16, clpf, ref_clpf);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+TEST_P(ClpfBlockHbdTest, TestSIMDNoMismatch) {
+ test_clpf(sizex, sizey, 12, 1, clpf, ref_clpf);
+}
+
+TEST_P(ClpfHbdSpeedTest, TestSpeed) {
+ test_clpf_speed(sizex, sizey, 12, 1, clpf, ref_clpf);
+}
+#endif
+
using std::tr1::make_tuple;
// Test all supported architectures and block sizes
@@ -212,6 +254,48 @@
4)));
#endif
+#if CONFIG_AOM_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, ClpfBlockHbdTest,
+ ::testing::Values(
+ make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8),
+ make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 4),
+ make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 8),
+ make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, ClpfBlockHbdTest,
+ ::testing::Values(
+ make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8),
+ make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 4),
+ make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 8),
+ make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, ClpfBlockHbdTest,
+ ::testing::Values(
+ make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8),
+ make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 4),
+ make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 8),
+ make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, ClpfBlockHbdTest,
+ ::testing::Values(
+ make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8),
+ make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 4),
+ make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 8),
+ make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+#endif
+
// Test speed for all supported architectures
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, ClpfSpeedTest,
@@ -236,4 +320,35 @@
::testing::Values(make_tuple(&aom_clpf_block_neon,
&aom_clpf_block_c, 8, 8)));
#endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, ClpfHbdSpeedTest,
+ ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse2,
+ &aom_clpf_block_hbd_c, 8,
+ 8)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, ClpfHbdSpeedTest,
+ ::testing::Values(make_tuple(&aom_clpf_block_hbd_ssse3,
+ &aom_clpf_block_hbd_c, 8,
+ 8)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, ClpfHbdSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse4_1,
+ &aom_clpf_block_hbd_c, 8,
+ 8)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, ClpfHbdSpeedTest,
+ ::testing::Values(make_tuple(&aom_clpf_block_hbd_neon,
+ &aom_clpf_block_hbd_c, 8,
+ 8)));
+#endif
+#endif
+
} // namespace
diff --git a/test/cx_set_ref.sh b/test/cx_set_ref.sh
deleted file mode 100755
index dfba40d..0000000
--- a/test/cx_set_ref.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/sh
-##
-## Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-## This file tests the libaom cx_set_ref example. To add new tests to this
-## file, do the following:
-## 1. Write a shell function (this is your test).
-## 2. Add the function to cx_set_ref_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: $YUV_RAW_INPUT is required.
-cx_set_ref_verify_environment() {
- if [ ! -e "${YUV_RAW_INPUT}" ]; then
- echo "Libaom test data must exist in LIBVPX_TEST_DATA_PATH."
- return 1
- fi
-}
-
-# Runs cx_set_ref and updates the reference frame before encoding frame 90.
-# $1 is the codec name.
-aom_set_ref() {
- local codec="$1"
- local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
-
- local output_file="${AOM_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf"
- local ref_frame_num=90
-
- if [ ! -x "${encoder}" ]; then
- elog "${encoder} does not exist or is not executable."
- return 1
- fi
-
- if [ "$codec" = "vp8" ]; then
- eval "${AOM_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \
- "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
- "${ref_frame_num}" ${devnull}
- else
- eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
- "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
- "${ref_frame_num}" ${devnull}
- fi
-
- [ -e "${output_file}" ] || return 1
-}
-
-cx_set_ref_av1() {
- if [ "$(av1_encode_available)" = "yes" ]; then
- aom_set_ref av1 || return 1
- fi
-}
-
-cx_set_ref_tests="cx_set_ref_av1"
-
-run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}"
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
deleted file mode 100755
index 7c5169d..0000000
--- a/test/vpxdec.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/sh
-##
-## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-## This file tests aomdec. To add new tests to this file, do the following:
-## 1. Write a shell function (this is your test).
-## 2. Add the function to aomdec_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: Make sure input is available.
-aomdec_verify_environment() {
- if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${AV1_WEBM_FILE}" ] || \
- [ ! -e "${AV1_FPM_WEBM_FILE}" ] || \
- [ ! -e "${AV1_LT_50_FRAMES_WEBM_FILE}" ] ; then
- elog "Libaom test data must exist in LIBVPX_TEST_DATA_PATH."
- return 1
- fi
- if [ -z "$(aom_tool_path aomdec)" ]; then
- elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
- return 1
- fi
-}
-
-# Wrapper function for running aomdec with pipe input. Requires that
-# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
-# input file path and shifted away. All remaining parameters are passed through
-# to aomdec.
-aomdec_pipe() {
- local readonly decoder="$(aom_tool_path aomdec)"
- local readonly input="$1"
- shift
- cat "${input}" | eval "${AOM_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
-}
-
-# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
-# the directory containing aomdec. $1 one is used as the input file path and
-# shifted away. All remaining parameters are passed through to aomdec.
-aomdec() {
- local readonly decoder="$(aom_tool_path aomdec)"
- local readonly input="$1"
- shift
- eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
-}
-
-aomdec_can_decode_vp8() {
- if [ "$(vp8_decode_available)" = "yes" ]; then
- echo yes
- fi
-}
-
-aomdec_can_decode_vp9() {
- if [ "$(vp9_decode_available)" = "yes" ]; then
- echo yes
- fi
-}
-
-aomdec_vp8_ivf() {
- if [ "$(aomdec_can_decode_vp8)" = "yes" ]; then
- aomdec "${VP8_IVF_FILE}" --summary --noblit
- fi
-}
-
-aomdec_vp8_ivf_pipe_input() {
- if [ "$(aomdec_can_decode_vp8)" = "yes" ]; then
- aomdec_pipe "${VP8_IVF_FILE}" --summary --noblit
- fi
-}
-
-aomdec_vp9_webm() {
- if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
- [ "$(webm_io_available)" = "yes" ]; then
- aomdec "${AV1_WEBM_FILE}" --summary --noblit
- fi
-}
-
-aomdec_vp9_webm_frame_parallel() {
- if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
- [ "$(webm_io_available)" = "yes" ]; then
- for threads in 2 3 4 5 6 7 8; do
- aomdec "${AV1_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \
- --frame-parallel
- done
- fi
-}
-
-aomdec_vp9_webm_less_than_50_frames() {
- # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
- # frames in actual webm_read_frame calls.
- if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
- [ "$(webm_io_available)" = "yes" ]; then
- local readonly decoder="$(aom_tool_path aomdec)"
- local readonly expected=10
- local readonly num_frames=$(${AOM_TEST_PREFIX} "${decoder}" \
- "${AV1_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
- | awk '/^[0-9]+ decoded frames/ { print $1 }')
- if [ "$num_frames" -ne "$expected" ]; then
- elog "Output frames ($num_frames) != expected ($expected)"
- return 1
- fi
- fi
-}
-
-aomdec_tests="aomdec_vp8_ivf
- aomdec_vp8_ivf_pipe_input
- aomdec_vp9_webm
- aomdec_vp9_webm_frame_parallel
- aomdec_vp9_webm_less_than_50_frames"
-
-run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index c996655..c4755f7 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -139,7 +139,7 @@
class Y4mVideoWriteTest : public Y4mVideoSourceTest {
protected:
- Y4mVideoWriteTest() {}
+ Y4mVideoWriteTest() : tmpfile_(NULL) {}
virtual ~Y4mVideoWriteTest() {
delete tmpfile_;