Merge "Refactor tx_size to pixel number in decodeframe.c" into nextgenv2
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 298b255..370472a 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -20,8 +20,23 @@
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
#include "aom_ports/mem_ops.h"
+#include "av1/common/odintrin.h"
-#define ANS_DIV(dividend, divisor) ((dividend) / (divisor))
+#if RANS_PRECISION <= OD_DIVU_DMAX
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+ do { \
+ quotient = OD_DIVU_SMALL((dividend), (divisor)); \
+ remainder = (dividend) - (quotient) * (divisor); \
+ } while (0)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+ do { \
+ quotient = (dividend) / (divisor); \
+ remainder = (dividend) % (divisor); \
+ } while (0)
+#endif
+
+#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
#ifdef __cplusplus
extern "C" {
@@ -72,9 +87,9 @@
ans->state /= IO_BASE;
}
if (!val)
- ans->state = ANS_DIV(ans->state * ANS_P8_PRECISION, p0);
+ ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
else
- ans->state = ANS_DIV((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
+ ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
}
struct rans_sym {
@@ -88,15 +103,17 @@
static INLINE void rans_write(struct AnsCoder *ans,
const struct rans_sym *const sym) {
const aom_cdf_prob p = sym->prob;
+ unsigned quot, rem;
while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
ans->state /= IO_BASE;
}
- ans->state =
- (ans->state / p) * RANS_PRECISION + ans->state % p + sym->cum_prob;
+ ANS_DIVREM(quot, rem, ans->state, p);
+ ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
}
-#undef ANS_DIV
+#undef ANS_DIV8
+#undef ANS_DIVREM
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
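
The answriter.h change above swaps the generic '/' and '%' in the rANS state update for a single division (via OD_DIVU_SMALL when the divisor fits within OD_DIVU_DMAX) plus a multiply-subtract to recover the remainder. A minimal stand-alone sketch of that shape, using a plain-C divrem macro and made-up constants rather than the library's actual coder state:

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as ANS_DIVREM: one quotient, remainder derived from it. */
    #define DIVREM(q, r, dividend, divisor)   \
      do {                                    \
        (q) = (dividend) / (divisor);         \
        (r) = (dividend) - (q) * (divisor);   \
      } while (0)

    int main(void) {
      const uint32_t precision = 1024;             /* stand-in for RANS_PRECISION */
      uint32_t state = 123456, p = 300, cum = 512; /* hypothetical symbol */
      uint32_t quot, rem;
      DIVREM(quot, rem, state, p);
      state = quot * precision + rem + cum;        /* mirrors rans_write() */
      printf("%u\n", state);
      return 0;
    }

In the patch itself the quotient comes from OD_DIVU_SMALL in av1/common/odintrin.h, so the hot path no longer issues both a hardware divide and a modulo.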
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index c74bfe3..28e7f12 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -394,9 +394,13 @@
DSP_SRCS-yes += simd/v64_intrinsics_c.h
DSP_SRCS-yes += simd/v128_intrinsics.h
DSP_SRCS-yes += simd/v128_intrinsics_c.h
+DSP_SRCS-yes += simd/v256_intrinsics.h
+DSP_SRCS-yes += simd/v256_intrinsics_c.h
DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h
index 3879d95..ae4ff23 100644
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -22,11 +22,11 @@
#include "./aom_simd_inline.h"
#if HAVE_NEON
-#include "simd/v128_intrinsics_arm.h"
+#include "simd/v256_intrinsics_arm.h"
#elif HAVE_SSE2
-#include "simd/v128_intrinsics_x86.h"
+#include "simd/v256_intrinsics_x86.h"
#else
-#include "simd/v128_intrinsics.h"
+#include "simd/v256_intrinsics.h"
#endif
#endif // AOM_DSP_AOM_AOM_SIMD_H_
diff --git a/aom_dsp/arm/aom_convolve8_avg_neon.c b/aom_dsp/arm/aom_convolve8_avg_neon.c
index 7dc936d..09429d6 100644
--- a/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve8_avg_neon.c
@@ -65,6 +65,10 @@
assert(x_step_q4 == 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_y;
+
q0s16 = vld1q_s16(filter_x);
src -= 3; // adjust for taps
@@ -241,6 +245,10 @@
assert(y_step_q4 == 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_x;
+
src -= src_stride * 3;
q0s16 = vld1q_s16(filter_y);
for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index ed0df6d..8ebffb5 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -65,6 +65,10 @@
assert(x_step_q4 == 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_y;
+
q0s16 = vld1q_s16(filter_x);
src -= 3; // adjust for taps
@@ -225,6 +229,10 @@
assert(y_step_q4 == 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)filter_x;
+
src -= src_stride * 3;
q0s16 = vld1q_s16(filter_y);
for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 0000000..73bcd94
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./v256_intrinsics_c.h"
+#include "./v128_intrinsics.h"
+#include "./v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, const unsigned int c) {
+ return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+
+typedef uint32_t sad256_internal;
+SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return c_v256_sad_u8_sum(s);
+}
+typedef uint32_t ssd256_internal;
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return c_v256_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return c_v256_shr_s32(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, const unsigned int n) {
+ return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, const unsigned int n) {
+ return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, const unsigned int n) {
+ return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, const unsigned int n) {
+ return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, const unsigned int n) {
+ return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, const unsigned int n) {
+ return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, const unsigned int n) {
+ return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, const unsigned int n) {
+ return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, const unsigned int n) {
+ return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, const unsigned int n) {
+ return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, const unsigned int n) {
+ return c_v256_shr_n_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_H */
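
The fallback header above also fixes the shape of the accumulator-style APIs (SAD/SSD): initialise, accumulate one 32-byte vector pair per call, then finalise. A usage sketch, assuming a build inside the library tree so the umbrella aom_simd.h from this patch resolves (names are taken from the header above):

    #include <stdint.h>
    #include "aom_dsp/aom_simd.h"

    /* Sum of absolute differences over 'rows' 32-byte rows of two buffers. */
    static uint32_t sad_32xh(const uint8_t *a, const uint8_t *b, int rows,
                             int stride) {
      sad256_internal acc = v256_sad_u8_init();
      int r;
      for (r = 0; r < rows; r++)
        acc = v256_sad_u8(acc, v256_load_unaligned(a + r * stride),
                          v256_load_unaligned(b + r * stride));
      return v256_sad_u8_sum(acc);  /* must be finalised, per the header */
    }

The same code compiles unchanged whichever backend aom_simd.h ends up selecting, which is the point of keeping the v256 interface identical across the plain-C, v128-composed and native implementations.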
diff --git a/aom_dsp/simd/v256_intrinsics_arm.h b/aom_dsp/simd/v256_intrinsics_arm.h
new file mode 100644
index 0000000..ba4ed71
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_arm.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include "./v256_intrinsics_v128.h"
+
+#endif /* _V256_INTRINSICS_H */
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 0000000..8a67f9e
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_C_H
+#define _V256_INTRINSICS_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./v128_intrinsics_c.h"
+#include "./aom_config.h"
+
+typedef union {
+ uint8_t u8[32];
+ uint16_t u16[16];
+ uint32_t u32[8];
+ uint64_t u64[4];
+ int8_t s8[32];
+ int16_t s16[16];
+ int32_t s32[8];
+ int64_t s64[4];
+ c_v64 v64[4];
+ c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+ c_v256 t;
+ t.v128[1] = hi;
+ t.v128[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+ uint64_t d) {
+ c_v256 t;
+ t.u64[3] = a;
+ t.u64[2] = b;
+ t.u64[1] = c;
+ t.u64[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+ c_v256 t;
+ t.u64[3] = a.u64;
+ t.u64[2] = b.u64;
+ t.u64[1] = c.u64;
+ t.u64[0] = d.u64;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+ c_v256 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 32; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+ if (simd_check && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+ abort();
+ }
+ return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+ if (simd_check && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+ abort();
+ }
+ c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+ return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef uint32_t c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+ c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+ c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+ c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+ c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+ c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+ c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+ c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+ c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+ c_v256 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
+ t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
+ t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
+ t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+ c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+ c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+ c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+ c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+ c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+ c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+ c_v128 lo_bits = c_v128_mullo_s16(a, b);
+ c_v128 hi_bits = c_v128_mulhi_s16(a, b);
+ return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
+ c_v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+ c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+ c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+ c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+ c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+ c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+ c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+ c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+ c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+ c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+ c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+ c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+ c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+ c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+ c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+ c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+ c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+ c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+ c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+ c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+ c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+ c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = a.u8[i * 2 + 1];
+ t.u8[i + 16] = b.u8[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = b.u8[i * 2];
+ t.u8[i + 16] = a.u8[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
+ : _c_v256_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
+ : _c_v256_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = a.u16[i * 2 + 1];
+ t.u16[i + 8] = b.u16[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = b.u16[i * 2];
+ t.u16[i + 8] = a.u16[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
+ : _c_v256_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
+ : _c_v256_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u32[7] = b.u32[7];
+ t.u32[6] = b.u32[5];
+ t.u32[5] = b.u32[3];
+ t.u32[4] = b.u32[1];
+ t.u32[3] = a.u32[7];
+ t.u32[2] = a.u32[5];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[7] = a.u32[6];
+ t.u32[6] = a.u32[4];
+ t.u32[5] = a.u32[2];
+ t.u32[4] = a.u32[0];
+ t.u32[3] = b.u32[6];
+ t.u32[2] = b.u32[4];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
+ : _c_v256_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
+ : _c_v256_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+ c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+ c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+ c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+ c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+ c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+ c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+ c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+ c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++) {
+ if (pattern.u8[c] & ~31) {
+ fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
+ c);
+ abort();
+ }
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+ }
+ return t;
+}
+
+// Pairwise / dual-lane shuffle: shuffle each of the two 128 bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+ return c_v256_from_v128(
+ c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
+ c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, const unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+ c_v128_shr_n_byte(a.v128[0], 16 - n)),
+ c_v128_shl_n_byte(a.v128[0], n));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+ c_v128_zero());
+ else
+ return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, const unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
+ c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+ c_v128_shl_n_byte(a.v128[1], 16 - n)));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_zero(),
+ c_v128_shr_n_byte(a.v128[1], n - 16));
+ else
+ return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+}
+
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, const unsigned int c) {
+ if (simd_check && c > 31) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+ : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+ c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+ c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+ c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+ c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+ c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+ c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+ c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+ c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, const unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+ c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, const unsigned int n) {
+ return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, const unsigned int n) {
+ return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, const unsigned int n) {
+ return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, const unsigned int n) {
+ return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, const unsigned int n) {
+ return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, const unsigned int n) {
+ return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, const unsigned int n) {
+ return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, const unsigned int n) {
+ return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, const unsigned int n) {
+ return c_v256_shr_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_C_H */
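
Throughout the plain-C implementation above, a 256-bit value is simply two c_v128 halves, with v128[1] holding the upper lanes and every two-input op composed as (op on the high halves, op on the low halves). A tiny check of that lane convention, assuming a build inside the library tree so the fallback headers and aom_config.h resolve (the values are arbitrary):

    #include <stdio.h>
    #include "aom_dsp/simd/v256_intrinsics_c.h"

    int main(void) {
      /* c_v256_from_64(a, b, c, d): 'a' lands in the top 64-bit lane. */
      c_v256 v = c_v256_from_64(3, 2, 1, 0);
      printf("%d %d\n", (int)v.u64[3], (int)v.u64[0]);  /* prints "3 0" */
      printf("%d\n", (int)c_v256_high_v128(v).u64[1]);  /* prints "3" */
      return 0;
    }

The endian-conditional branches in the unzip helpers keep the same logical lane semantics on big-endian builds.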
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 0000000..93cccce
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_V128_H
+#define _V256_INTRINSICS_V128_H
+
+#if HAVE_NEON
+#include "./v128_intrinsics_arm.h"
+#elif HAVE_SSE2
+#include "./v128_intrinsics_x86.h"
+#else
+#include "./v128_intrinsics.h"
+#endif
+
+typedef struct { v128 lo, hi; } v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ v256 t;
+ t.hi = hi;
+ t.lo = lo;
+ return t;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
+ v128_load_unaligned(p));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
+ v128_load_aligned(p));
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ v128_store_unaligned(p, a.lo);
+ v128_store_unaligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ v128_store_aligned(p, a.lo);
+ v128_store_aligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE v256 v256_zero() {
+ return v256_from_v128(v128_zero(), v128_zero());
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+ v128 t = v128_dup_8(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+ v128 t = v128_dup_16(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+ v128 t = v128_dup_32(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+}
+
+typedef struct {
+ sad128_internal hi;
+ sad128_internal lo;
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ sad256_internal t;
+ t.hi = v128_sad_u8_init();
+ t.lo = v128_sad_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ sad256_internal t;
+ t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
+ t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+}
+
+typedef struct {
+ ssd128_internal hi;
+ ssd128_internal lo;
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ ssd256_internal t;
+ t.hi = v128_ssd_u8_init();
+ t.lo = v128_ssd_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ ssd256_internal t;
+ t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
+ t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+ return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+ return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+ return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+ return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+ return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+ return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+ return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+ return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(a.lo, b.lo);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(a.hi, b.hi);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
+ v128_unziplo_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
+ v128_unziphi_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
+ v128_unziplo_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
+ v128_unziphi_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
+ v128_pack_s32_s16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
+ v128_pack_s16_u8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
+ v128_pack_s16_s8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
+ v128_unpacklo_u16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
+ v128_unpacklo_s16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
+ v128_unpacklo_u16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
+ v128_unpacklo_s16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
+ v128 masklo = v128_cmplt_s8(pattern.lo, c16);
+ return v256_from_v128(
+ v128_or(
+ v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
+ v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
+ v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
+ v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
+ masklo)));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return v256_from_v128(
+ v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+ v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \
+ v128_shr_n_byte(a.lo, 16 - (n))), \
+ v128_shl_n_byte(a.lo, (n))) \
+ : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \
+ v128_zero()))
+
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \
+ v128_or(v128_shr_n_byte(a.lo, n), \
+ v128_shl_n_byte(a.hi, 16 - (n)))) \
+ : v256_from_v128(v128_zero(), \
+ (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi))
+
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, n) \
+ v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+#define v256_shl_n_16(a, n) \
+ v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+#define v256_shl_n_32(a, n) \
+ v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+#define v256_shr_n_u8(a, n) \
+ v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+#define v256_shr_n_u16(a, n) \
+ v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+#define v256_shr_n_u32(a, n) \
+ v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+#define v256_shr_n_s8(a, n) \
+ v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+#define v256_shr_n_s16(a, n) \
+ v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+#define v256_shr_n_s32(a, n) \
+ v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
+
+#endif /* _V256_INTRINSICS_V128_H */
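
The comment in the header above explains why the *_n_* shift variants are #defines rather than SIMD_INLINE functions: on x86 the underlying byte-shift intrinsics only accept compile-time immediates, so the shift amount must survive as a literal all the way down. A usage sketch (assuming the umbrella aom_simd.h from this patch):

    #include "aom_dsp/aom_simd.h"

    static v256 shift_example(v256 x) {
      v256 up = v256_shl_n_byte(x, 4);  /* ok: 4 is a compile-time constant */
      v256 dn = v256_shr_n_u16(x, 3);   /* ok: 3 is a compile-time constant */
      /* Passing a runtime variable here can fail to compile on targets where
         the macro expands to an immediate-only intrinsic; use v256_shr_u16()
         and friends for variable shift counts instead. */
      return v256_or(up, dn);
    }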
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 0000000..b5bdb53
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#if !defined(__AVX2__)
+
+#include "./v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s name mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && __GNUC__ < 5 && defined(__AVX2__) && \
+ defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+#include "./v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+ return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE v128 v256_low_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 0);
+}
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+ // gcc seems to be missing _mm256_set_m128i()
+ return _mm256_insertf128_si256(
+ _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return _mm256_adds_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return _mm256_subs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+
+// AVX2 doesn't provide direct intrinsics to zip/unzip the 8, 16 or 32 bit
+// lanes of the lower or upper half of a 256 bit vector, because its
+// unpack/pack intrinsics treat the 256 bit input as two independent
+// 128 bit vectors.
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
+ v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
+ v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
+ v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
+ v128_unpacklo_u8_s16(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
+ v128_unpacklo_u8_s16(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
+ v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
+ v128_unpacklo_u16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
+ v128_unpacklo_s16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
+ v128_unpacklo_u16_s32(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
+ v128_unpacklo_s16_s32(v256_high_v128(a)));
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 hi = v256_high_v128(pattern);
+ v128 lo = v256_low_v128(pattern);
+ v128 maskhi = v128_cmplt_s8(hi, c16);
+ v128 masklo = v128_cmplt_s8(lo, c16);
+ return v256_from_v128(
+ v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
+ v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
+ maskhi)),
+ v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
+ v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
+ masklo)));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return _mm256_shuffle_epi8(a, pattern);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ v256 r = _mm256_madd_epi16(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+ v128 lo = v256_low_v128(t);
+ v128 hi = v256_high_v128(t);
+ lo = v128_add_32(lo, hi);
+ return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
+}
+
+typedef v256 sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation-dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+typedef v256 ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation-dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
+ v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
+ v256 rl = _mm256_madd_epi16(l, l);
+ v256 rh = _mm256_madd_epi16(h, h);
+ v128 c = _mm_cvtsi32_si128(32);
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
+ return _mm256_add_epi64(
+ s,
+ _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
+
+SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return _mm256_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return _mm256_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return _mm256_mullo_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return _mm256_madd_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return _mm256_maddubs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return _mm256_sub_epi8(
+ _mm256_avg_epu8(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return _mm256_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return _mm256_cmpeq_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128(c + 8);
+ return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
+ _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+// _mm256_slli_si256 works on 128 bit lanes and can't be used
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 \
+ ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \
+ v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
+ v128_shl_n_byte(v256_low_v128(a), n)) \
+ : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \
+ v128_zero()))
+
+// _mm256_srli_si256 works on 128 bit lanes and can't be used
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 \
+ ? _mm256_alignr_epi8( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+ : ((n) > 16 \
+ ? _mm256_srli_si256( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
+ (n)-16) \
+ : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
+
+// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
+
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+ _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c) \
+ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#endif
+
+#endif /* _V256_INTRINSICS_H */
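The new 256-bit layer above mirrors the existing v64/v128 API: init/accumulate/finalise helpers such as v256_sad_u8_init(), v256_sad_u8() and v256_sad_u8_sum() are meant to be chained. A minimal usage sketch follows; it assumes AVX2 (or one of the fallback headers) via aom_dsp/aom_simd.h, a length that is a multiple of 32 and small enough to respect the 32-accumulation limit noted above, and the helper name sad_row() is illustrative only.

#include <stdint.h>
#include "aom_dsp/aom_simd.h"

// Sum of absolute differences over two byte rows, 32 bytes per step.
// Sketch only: len must be a multiple of 32 and at most 32 * 32 bytes.
static uint32_t sad_row(const uint8_t *a, const uint8_t *b, int len) {
  sad256_internal acc = v256_sad_u8_init();
  int i;
  for (i = 0; i < len; i += 32) {
    const v256 va = v256_load_unaligned(a + i);
    const v256 vb = v256_load_unaligned(b + i);
    acc = v256_sad_u8(acc, va, vb);  // implementation-dependent partial sums
  }
  return v256_sad_u8_sum(acc);  // finalise into a scalar SAD
}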
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index ffaed02..87ff34b 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -27,6 +27,10 @@
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
assert(filter[3] != 128); \
assert(step_q4 == 16); \
if (filter[0] | filter[1] | filter[2]) { \
diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 49084db..28604ac 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c
@@ -68,24 +68,25 @@
unsigned int source_scale, unsigned int source_length,
unsigned char *dest, int dest_step,
unsigned int dest_scale, unsigned int dest_length) {
- unsigned int i, j;
- unsigned int temp;
- int source_pitch = source_step;
+ const int source_pitch = source_step;
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
(void)source_length;
(void)source_scale;
(void)dest_scale;
- source_step *= 2;
- dest[0] = source[0];
+ source_step *= 2; // Every other row.
- for (i = dest_step, j = source_step; i < dest_length * dest_step;
- i += dest_step, j += source_step) {
- temp = 8;
- temp += 3 * source[j - source_pitch];
- temp += 10 * source[j];
- temp += 3 * source[j + source_pitch];
- temp >>= 4;
- dest[i] = (char)(temp);
+ dest[0] = source[0]; // Special case: 1st pixel.
+ source += source_step;
+ dest += dest_step;
+
+ while (dest < dest_end) {
+ const unsigned int a = 3 * source[-source_pitch];
+ const unsigned int b = 10 * source[0];
+ const unsigned int c = 3 * source[source_pitch];
+ *dest = (unsigned char)((8 + a + b + c) >> 4);
+ source += source_step;
+ dest += dest_step;
}
}
@@ -119,17 +120,18 @@
unsigned int source_length, unsigned char *dest,
int dest_step, unsigned int dest_scale,
unsigned int dest_length) {
- unsigned int i, j;
-
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
(void)source_length;
(void)source_scale;
(void)dest_scale;
- source_step *= 2;
- j = 0;
+ source_step *= 2; // Every other row.
- for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
- dest[i] = source[j];
+ while (dest < dest_end) {
+ *dest = *source;
+ source += source_step;
+ dest += dest_step;
+ }
}
/****************************************************************************
*
@@ -159,12 +161,12 @@
unsigned int source_scale, unsigned int source_length,
unsigned char *dest, int dest_step,
unsigned int dest_scale, unsigned int dest_length) {
- unsigned int i;
- unsigned int round_value = dest_scale / 2;
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
+ const unsigned int round_value = dest_scale / 2;
unsigned int left_modifier = dest_scale;
unsigned int right_modifier = 0;
- unsigned char left_pixel = *source;
- unsigned char right_pixel = *(source + source_step);
+ unsigned char left_pixel = source[0];
+ unsigned char right_pixel = source[source_step];
(void)source_length;
@@ -173,18 +175,18 @@
/* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) *
* source_scale);*/
- for (i = 0; i < dest_length * dest_step; i += dest_step) {
- dest[i] = (char)((left_modifier * left_pixel +
- right_modifier * right_pixel + round_value) /
- dest_scale);
+ while (dest < dest_end) {
+ *dest = (unsigned char)((left_modifier * left_pixel +
+ right_modifier * right_pixel + round_value) /
+ dest_scale);
right_modifier += source_scale;
while (right_modifier > dest_scale) {
right_modifier -= dest_scale;
source += source_step;
- left_pixel = *source;
- right_pixel = *(source + source_step);
+ left_pixel = source[0];
+ right_pixel = source[source_step];
}
left_modifier = dest_scale - right_modifier;
@@ -236,11 +238,10 @@
unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area,
unsigned char temp_area_height, unsigned int hscale, unsigned int hratio,
unsigned int vscale, unsigned int vratio, unsigned int interlaced) {
- /*unsigned*/
- int i, j, k;
- int bands;
- int dest_band_height;
- int source_band_height;
+ unsigned int i, j, k;
+ unsigned int bands;
+ unsigned int dest_band_height;
+ unsigned int source_band_height;
typedef void (*Scale1D)(const unsigned char *source, int source_step,
unsigned int source_scale, unsigned int source_length,
@@ -331,7 +332,7 @@
if (ratio_scalable) {
if (source_height == dest_height) {
/* for each band of the image */
- for (k = 0; k < (int)dest_height; k++) {
+ for (k = 0; k < dest_height; ++k) {
horiz_line_scale(source, source_width, dest, dest_width);
source += source_pitch;
dest += dest_pitch;
@@ -346,14 +347,13 @@
horiz_line_scale(source, source_width, temp_area, dest_width);
}
- for (k = 0;
- k < (int)(dest_height + dest_band_height - 1) / dest_band_height;
- k++) {
+ for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height;
+ ++k) {
/* scale one band horizontally */
- for (i = 0; i < source_band_height; i++) {
+ for (i = 0; i < source_band_height; ++i) {
/* Trap case where we could read off the base of the source buffer */
- line_src = (unsigned char *)source + i * source_pitch;
+ line_src = source + i * source_pitch;
if (line_src < source_base) line_src = source_base;
@@ -388,7 +388,7 @@
if (source_height == dest_height) {
/* for each band of the image */
- for (k = 0; k < (int)dest_height; k++) {
+ for (k = 0; k < dest_height; ++k) {
Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
dest_width);
source += source_pitch;
@@ -414,10 +414,10 @@
/* for each band of the image */
bands = (dest_height + dest_band_height - 1) / dest_band_height;
- for (k = 0; k < bands; k++) {
+ for (k = 0; k < bands; ++k) {
/* scale one band horizontally */
- for (i = 1; i < source_band_height + 1; i++) {
- if (k * source_band_height + i < (int)source_height) {
+ for (i = 1; i < source_band_height + 1; ++i) {
+ if (k * source_band_height + i < source_height) {
Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
temp_area + i * dest_pitch, 1, hratio, dest_width);
} else { /* Duplicate the last row */
@@ -428,7 +428,7 @@
}
/* scale one band vertically */
- for (j = 0; j < (int)dest_width; j++) {
+ for (j = 0; j < dest_width; ++j) {
Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
&dest[j], dest_pitch, vratio, dest_band_height);
}
@@ -487,12 +487,12 @@
temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
if (dw < (int)dst->y_width)
- for (i = 0; i < dh; i++)
+ for (i = 0; i < dh; ++i)
memset(dst->y_buffer + i * dst->y_stride + dw - 1,
dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
if (dh < (int)dst->y_height)
- for (i = dh - 1; i < (int)dst->y_height; i++)
+ for (i = dh - 1; i < (int)dst->y_height; ++i)
memcpy(dst->y_buffer + i * dst->y_stride,
dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
@@ -502,13 +502,13 @@
vratio, interlaced);
if (dw / 2 < (int)dst->uv_width)
- for (i = 0; i < dst->uv_height; i++)
+ for (i = 0; i < dst->uv_height; ++i)
memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1,
dst->u_buffer[i * dst->uv_stride + dw / 2 - 2],
dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int)dst->uv_height)
- for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+ for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
memcpy(dst->u_buffer + i * dst->uv_stride,
dst->u_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
@@ -518,13 +518,13 @@
vratio, interlaced);
if (dw / 2 < (int)dst->uv_width)
- for (i = 0; i < dst->uv_height; i++)
+ for (i = 0; i < dst->uv_height; ++i)
memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1,
dst->v_buffer[i * dst->uv_stride + dw / 2 - 2],
dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int)dst->uv_height)
- for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+ for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
memcpy(dst->v_buffer + i * dst->uv_stride,
dst->v_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
}
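The rewritten 2:1 vertical interpolation above keeps the original 3/16, 10/16, 3/16 kernel with a bias of 8 for rounding, so the shifted sum can never exceed 255 and no clamp is needed. A stand-alone sketch of one output sample (not part of the patch) makes the arithmetic easy to check:

#include <stdio.h>

// One sample of the [3, 10, 3] / 16 filter used above. For a = b = c = 255
// the numerator is 8 + 765 + 2550 + 765 = 4088 and 4088 >> 4 = 255.
static unsigned char filter_2t1_i(unsigned char a, unsigned char b,
                                  unsigned char c) {
  return (unsigned char)((8 + 3 * a + 10 * b + 3 * c) >> 4);
}

int main(void) {
  printf("%u\n", filter_2t1_i(255, 255, 255));  // prints 255
  return 0;
}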
diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c
index 57c464d..fd638bd 100644
--- a/aom_scale/generic/gen_scalers.c
+++ b/aom_scale/generic/gen_scalers.c
@@ -39,27 +39,23 @@
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width) {
- unsigned i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
+ const unsigned char *const source_end = source + source_width;
(void)dest_width;
- for (i = 0; i < source_width; i += 5) {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
+ while (source < source_end) {
+ const unsigned int a = source[0];
+ const unsigned int b = source[1];
+ const unsigned int c = source[2];
+ const unsigned int d = source[3];
+ const unsigned int e = source[4];
- des[0] = (unsigned char)a;
- des[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
- des[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
- des[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+ dest[0] = (unsigned char)a;
+ dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+ dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+ dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
- src += 5;
- des += 4;
+ source += 5;
+ dest += 4;
}
}
@@ -67,25 +63,21 @@
unsigned int src_pitch, unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width) {
- unsigned int i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- unsigned char *src = source;
+ const unsigned char *const dest_end = dest + dest_width;
+ while (dest < dest_end) {
+ const unsigned int a = source[0 * src_pitch];
+ const unsigned int b = source[1 * src_pitch];
+ const unsigned int c = source[2 * src_pitch];
+ const unsigned int d = source[3 * src_pitch];
+ const unsigned int e = source[4 * src_pitch];
- for (i = 0; i < dest_width; i++) {
- a = src[0 * src_pitch];
- b = src[1 * src_pitch];
- c = src[2 * src_pitch];
- d = src[3 * src_pitch];
- e = src[4 * src_pitch];
+ dest[0 * dest_pitch] = (unsigned char)a;
+ dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+ dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+ dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
- des[0 * dest_pitch] = (unsigned char)a;
- des[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
- des[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
- des[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
-
- src++;
- des++;
+ ++source;
+ ++dest;
}
}
@@ -114,26 +106,21 @@
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width) {
- unsigned int i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
+ const unsigned char *const source_end = source + source_width;
(void)dest_width;
+ while (source < source_end) {
+ const unsigned int a = source[0];
+ const unsigned int b = source[1];
+ const unsigned int c = source[2];
+ const unsigned int d = source[3];
+ const unsigned int e = source[4];
- for (i = 0; i < source_width; i += 5) {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
+ dest[0] = (unsigned char)a;
+ dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+ dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
- des[0] = (unsigned char)a;
- des[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
- des[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
- src += 5;
- des += 3;
+ source += 5;
+ dest += 3;
}
}
@@ -141,24 +128,20 @@
unsigned int src_pitch, unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width) {
- unsigned int i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- unsigned char *src = source;
+ const unsigned char *const dest_end = dest + dest_width;
+ while (dest < dest_end) {
+ const unsigned int a = source[0 * src_pitch];
+ const unsigned int b = source[1 * src_pitch];
+ const unsigned int c = source[2 * src_pitch];
+ const unsigned int d = source[3 * src_pitch];
+ const unsigned int e = source[4 * src_pitch];
- for (i = 0; i < dest_width; i++) {
- a = src[0 * src_pitch];
- b = src[1 * src_pitch];
- c = src[2 * src_pitch];
- d = src[3 * src_pitch];
- e = src[4 * src_pitch];
+ dest[0 * dest_pitch] = (unsigned char)a;
+ dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+ dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
- des[0 * dest_pitch] = (unsigned char)a;
- des[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
- des[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
- src++;
- des++;
+ ++source;
+ ++dest;
}
}
@@ -186,18 +169,12 @@
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width) {
- unsigned int i;
- unsigned int a;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
+ const unsigned char *const source_end = source + source_width;
(void)dest_width;
-
- for (i = 0; i < source_width; i += 2) {
- a = src[0];
- des[0] = (unsigned char)(a);
- src += 2;
- des += 1;
+ while (source < source_end) {
+ dest[0] = source[0];
+ source += 2;
+ ++dest;
}
}
@@ -215,18 +192,14 @@
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width) {
- int i;
- int temp;
- int width = dest_width;
-
+ const unsigned char *const dest_end = dest + dest_width;
(void)dest_pitch;
-
- for (i = 0; i < width; i++) {
- temp = 8;
- temp += source[i - (int)src_pitch] * 3;
- temp += source[i] * 10;
- temp += source[i + src_pitch] * 3;
- temp >>= 4;
- dest[i] = (unsigned char)(temp);
+ while (dest < dest_end) {
+ const unsigned int a = source[-(int)src_pitch] * 3;
+ const unsigned int b = source[0] * 10;
+ const unsigned int c = source[src_pitch] * 3;
+ dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+ ++source;
+ ++dest;
}
}
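The horizontal 5-to-4 weights above (256/0, 192/64, 128/128, 64/192) are consistent with plain linear interpolation at source positions 0, 1.25, 2.5 and 3.75, quantised to 1/256th of a pixel. A small sketch (not part of the patch) that reproduces them:

#include <stdio.h>

/* Derive the 5-to-4 tap weights from the output sample positions: each
   output blends its two source neighbours with weights 256 - frac and frac,
   where frac is the fractional position in 1/256 pixel units. */
int main(void) {
  int k;
  for (k = 0; k < 4; ++k) {
    const int pos_q8 = k * 5 * 256 / 4; /* source position, Q8 */
    const int frac = pos_q8 & 255;      /* fractional part */
    printf("dst[%d] = %d * src[%d] + %d * src[%d]\n", k, 256 - frac,
           pos_q8 >> 8, frac, (pos_q8 >> 8) + 1);
  }
  return 0;
}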
diff --git a/aomenc.c b/aomenc.c
index 8eb30ed..497c8d5 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -1415,9 +1415,8 @@
#if CONFIG_WEBM_IO
if (stream->config.write_webm) {
stream->webm_ctx.stream = stream->file;
- write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate,
- stream->config.stereo_fmt, global->codec->fourcc,
- pixel_aspect_ratio);
+ write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt,
+ global->codec->fourcc, pixel_aspect_ratio);
}
#else
(void)pixel_aspect_ratio;
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 7da80f0..43cc3a2 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -828,7 +828,7 @@
static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
va_list args) {
- aom_ref_frame_t *data = va_arg(args, aom_ref_frame_t *);
+ const aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
@@ -836,8 +836,7 @@
return AOM_CODEC_INCAPABLE;
}
- if (data) {
- aom_ref_frame_t *frame = (aom_ref_frame_t *)data;
+ if (frame) {
YV12_BUFFER_CONFIG sd;
AVxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 8938bbf..6332fed 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -92,40 +92,38 @@
}
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
- int aoff, int loff) {
+ TX_SIZE tx_size, int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const a = pd->above_context + aoff;
ENTROPY_CONTEXT *const l = pd->left_context + loff;
- const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
- const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
+ const int tx_size_in_blocks = 1 << tx_size;
// above
if (has_eob && xd->mb_to_right_edge < 0) {
int i;
- const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
- (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- int above_contexts = tx_w_in_blocks;
+ const int blocks_wide =
+ pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ int above_contexts = tx_size_in_blocks;
if (above_contexts + aoff > blocks_wide)
above_contexts = blocks_wide - aoff;
for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
- for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0;
+ for (i = above_contexts; i < tx_size_in_blocks; ++i) a[i] = 0;
} else {
- memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
// left
if (has_eob && xd->mb_to_bottom_edge < 0) {
int i;
- const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
- (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
- int left_contexts = tx_h_in_blocks;
+ const int blocks_high =
+ pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int left_contexts = tx_size_in_blocks;
if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
- for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0;
+ for (i = left_contexts; i < tx_size_in_blocks; ++i) l[i] = 0;
} else {
- memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
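The simplification above relies on the square transform sizes being numbered so that 1 << tx_size is the transform dimension in 4x4 blocks. A sanity-check sketch under that assumption (TX_4X4 = 0 through TX_32X32 = 3), not part of the patch:

#include <assert.h>

/* Assumed ordering of the square transform sizes: 4x4 = 0, 8x8 = 1,
   16x16 = 2, 32x32 = 3, so 1 << tx_size counts 4x4 blocks per side. */
static int tx_size_in_4x4_blocks(int tx_size) { return 1 << tx_size; }

int main(void) {
  assert(tx_size_in_4x4_blocks(0) == 1); /* 4x4   */
  assert(tx_size_in_4x4_blocks(1) == 2); /* 8x8   */
  assert(tx_size_in_4x4_blocks(2) == 4); /* 16x16 */
  assert(tx_size_in_4x4_blocks(3) == 8); /* 32x32 */
  return 0;
}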
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 6296faa..bc1970c 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -191,7 +191,6 @@
TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
#endif
int8_t skip;
- int8_t has_no_coeffs;
int8_t segment_id;
#if CONFIG_SUPERTX
// Minimum of all segment IDs under the current supertx block.
@@ -757,8 +756,7 @@
void *arg);
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
- int aoff, int loff);
+ TX_SIZE tx_size, int has_eob, int aoff, int loff);
#if CONFIG_EXT_INTER
static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 20e8904..78f4ffe 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -892,13 +892,14 @@
#if CONFIG_PALETTE
int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
- int c, int n, int *color_order) {
+ int c, int n, uint8_t *color_order,
+ int *color_idx) {
int i, j, max, max_idx, temp;
int scores[PALETTE_MAX_SIZE + 10];
int weights[4] = { 3, 2, 3, 2 };
int color_ctx = 0;
int color_neighbors[4];
-
+ int inverse_color_order[PALETTE_MAX_SIZE];
assert(n <= PALETTE_MAX_SIZE);
if (c - 1 >= 0)
@@ -918,7 +919,10 @@
else
color_neighbors[3] = -1;
- for (i = 0; i < PALETTE_MAX_SIZE; ++i) color_order[i] = i;
+ for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+ color_order[i] = i;
+ inverse_color_order[i] = i;
+ }
memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
for (i = 0; i < 4; ++i) {
if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
@@ -944,6 +948,8 @@
temp = color_order[i];
color_order[i] = color_order[max_idx];
color_order[max_idx] = temp;
+ inverse_color_order[color_order[i]] = i;
+ inverse_color_order[color_order[max_idx]] = max_idx;
}
}
@@ -956,7 +962,9 @@
}
if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
-
+ if (color_idx != NULL) {
+ *color_idx = inverse_color_order[color_map[r * cols + c]];
+ }
return color_ctx;
}
#endif // CONFIG_PALETTE
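av1_get_palette_color_context() now optionally reports the position of the current pixel's colour within the sorted colour order, so callers no longer have to search color_order themselves: the decoder passes NULL (the colour is not known yet) and the encoder passes a pointer, as the later hunks in this patch show. A calling sketch, assuming CONFIG_PALETTE and the usual AV1 headers on the include path; the 2x2 map and palette size are made-up values.

#include <stdint.h>
#include "av1/common/entropymode.h"

static void palette_context_example(void) {
  uint8_t color_map[4] = { 0, 1, 1, 2 }; /* 2 rows x 2 cols, illustrative */
  uint8_t color_order[PALETTE_MAX_SIZE];
  int color_idx;
  /* Decoder-style call: context only, the index is not yet known. */
  int ctx = av1_get_palette_color_context(color_map, 2, 1, 1, 3, color_order,
                                          NULL);
  /* Encoder-style call: also recover the index of color_map[1 * 2 + 1]. */
  ctx = av1_get_palette_color_context(color_map, 2, 1, 1, 3, color_order,
                                      &color_idx);
  (void)ctx;
  (void)color_idx;
}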
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 85c68e1..68a6400 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -359,7 +359,8 @@
#if CONFIG_PALETTE
int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
- int c, int n, int *color_order);
+ int c, int n, uint8_t *color_order,
+ int *color_idx);
#endif // CONFIG_PALETTE
#ifdef __cplusplus
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index c8022f2..d0b897c 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -753,7 +753,7 @@
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
- if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+ if (mbmi->skip && is_inter_block(mbmi)) return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
@@ -818,7 +818,7 @@
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
- if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+ if (mbmi->skip && is_inter_block(mbmi)) return;
*above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y])
<< shift_y;
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index afc9da4..3c8eac8 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -520,6 +520,17 @@
return len + MAX_MIB_SIZE;
}
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+ int bhl) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+ xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+ xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+ xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+ }
+}
+
static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
int mi_row, int bh, int mi_col, int bw,
int mi_rows, int mi_cols) {
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index c0fc494..b07a8bd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -647,6 +647,87 @@
}
#endif
+#if CONFIG_SUB8X8_MC
+ if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+ // block size in log2
+ const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+ const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+ const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
+
+ // block size
+ const int b4_w = 1 << b4_wl;
+ const int b4_h = 1 << b4_hl;
+ const int b8_s = 1 << b8_sl;
+ int idx, idy;
+
+ const int x_base = x;
+ const int y_base = y;
+
+ // processing unit size
+ const int x_step = w >> (b8_sl - b4_wl);
+ const int y_step = h >> (b8_sl - b4_hl);
+
+ for (idy = 0; idy < b8_s; idy += b4_h) {
+ for (idx = 0; idx < b8_s; idx += b4_w) {
+ const int chr_idx = (idy * 2) + idx;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf;
+ const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys, subpel_x, subpel_y;
+ const int is_scaled = av1_is_scaled(sf);
+
+ x = x_base + idx * x_step;
+ y = y_base + idy * y_step;
+
+ dst += dst_buf->stride * y + x;
+
+ if (is_scaled) {
+ pre =
+ pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+ scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ pre = pre_buf->buf + y * pre_buf->stride + x;
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+ pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+ (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ subpel_x, subpel_y, sf, x_step, y_step, ref,
+ &mi->mbmi.interp_filter, xs, ys, xd->bd);
+ } else {
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ subpel_x, subpel_y, sf, x_step, y_step, ref,
+ &mi->mbmi.interp_filter, xs, ys);
+ }
+#else
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, x_step, y_step, ref,
+ &mi->mbmi.interp_filter, xs, ys);
+#endif
+ }
+ }
+ }
+ return;
+ }
+#endif
+
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index bfa7e95..5f62f0a 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -50,7 +50,7 @@
const int16_t *kernel_y =
av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
#else
- if (interp_filter_params.taps == SUBPEL_TAPS) {
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
const int16_t *kernel_x =
av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
const int16_t *kernel_y =
@@ -109,7 +109,7 @@
const int16_t *kernel_y =
av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
#else
- if (interp_filter_params.taps == SUBPEL_TAPS) {
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
const int16_t *kernel_x =
av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
const int16_t *kernel_y =
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 27640b7..146ca23 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -383,17 +383,6 @@
}
}
-static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
- int bhl) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
- xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
- xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
- xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
- }
-}
-
static MB_MODE_INFO *set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
int bw, int bh, int x_mis, int y_mis, int bwl,
@@ -1153,7 +1142,6 @@
#endif // CONFIG_EXT_PARTITION_TYPES
BLOCK_SIZE bsize, int bwl, int bhl) {
AV1_COMMON *const cm = &pbi->common;
- const int less8x8 = bsize < BLOCK_8X8;
const int bw = 1 << (bwl - 1);
const int bh = 1 << (bhl - 1);
const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
@@ -1373,9 +1361,6 @@
row, col, tx_size);
#endif
}
-
- if (!less8x8 && eobtotal == 0)
- mbmi->has_no_coeffs = 1; // skip loopfilter
}
}
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 4d181fd..8260f9d 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -718,9 +718,15 @@
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
FRAME_COUNTS *counts = xd->counts;
TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+#if CONFIG_DAALA_EC
+ mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+ r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES,
+ ACCT_STR)];
+#else
mbmi->tx_type = aom_read_tree(
r, av1_ext_tx_tree,
cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
+#endif
if (counts)
++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
} else {
@@ -740,7 +746,7 @@
// Integer part
if (class0) {
- d = aom_read_tree(r, av1_mv_class0_tree, mvcomp->class0, ACCT_STR);
+ d = aom_read(r, mvcomp->class0[0], ACCT_STR);
mag = 0;
} else {
int i;
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 7077788..f2f74f5 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -294,59 +294,18 @@
return c;
}
-// TODO(slavarnway): Decode version of av1_set_context. Modify
-// av1_set_context
-// after testing is complete, then delete this version.
-static void dec_set_contexts(const MACROBLOCKD *xd,
- struct macroblockd_plane *pd, TX_SIZE tx_size,
- int has_eob, int aoff, int loff) {
- ENTROPY_CONTEXT *const a = pd->above_context + aoff;
- ENTROPY_CONTEXT *const l = pd->left_context + loff;
- const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
- const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
-
- // above
- if (has_eob && xd->mb_to_right_edge < 0) {
- int i;
- const int blocks_wide =
- pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- int above_contexts = tx_w_in_blocks;
- if (above_contexts + aoff > blocks_wide)
- above_contexts = blocks_wide - aoff;
-
- for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
- for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0;
- } else {
- memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
- }
-
- // left
- if (has_eob && xd->mb_to_bottom_edge < 0) {
- int i;
- const int blocks_high =
- pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
- int left_contexts = tx_h_in_blocks;
- if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
-
- for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
- for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0;
- } else {
- memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
- }
-}
-
#if CONFIG_PALETTE
void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
aom_reader *r) {
- MODE_INFO *const mi = xd->mi[0];
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
(xd->plane[plane != 0].subsampling_y);
const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
(xd->plane[plane != 0].subsampling_x);
- int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
- int n = mbmi->palette_mode_info.palette_size[plane != 0];
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ const int n = mbmi->palette_mode_info.palette_size[plane != 0];
int i, j;
uint8_t *color_map = xd->plane[plane != 0].color_index_map;
const aom_prob(*const prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
@@ -355,10 +314,10 @@
for (i = 0; i < rows; ++i) {
for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
- color_ctx =
- av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
- color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
- prob[n - 2][color_ctx], ACCT_STR);
+ const int color_ctx = av1_get_palette_color_context(color_map, cols, i, j,
+ n, color_order, NULL);
+ const int color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
+ prob[n - 2][color_ctx], ACCT_STR);
assert(color_idx >= 0 && color_idx < n);
color_map[i * cols + j] = color_order[color_idx];
}
@@ -391,11 +350,6 @@
#endif // CONFIG_NEW_QUANT
ctx, sc->scan, sc->neighbors, r);
#endif // CONFIG_AOM_QM
- dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
- /*
- av1_set_contexts(xd, pd,
- get_plane_block_size(xd->mi[0]->mbmi.sb_type, pd),
- tx_size, eob > 0, x, y);
- */
+ av1_set_contexts(xd, pd, tx_size, eob > 0, x, y);
return eob;
}
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 7068604..e0fb7ec 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1666,11 +1666,19 @@
#else
if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_DAALA_EC
+ aom_write_symbol(
+ w, av1_ext_tx_ind[mbmi->tx_type],
+ cm->fc->intra_ext_tx_cdf[mbmi->tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ TX_TYPES);
+#else
av1_write_token(
w, av1_ext_tx_tree,
cm->fc->intra_ext_tx_prob[mbmi->tx_size]
[intra_mode_to_tx_type_context[mbmi->mode]],
&ext_tx_encodings[mbmi->tx_type]);
+#endif
}
#endif // CONFIG_EXT_TX
}
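Under CONFIG_DAALA_EC the intra ext-tx type is now coded as a symbol against a CDF rather than with a token tree: the encoder writes av1_ext_tx_ind[tx_type] via aom_write_symbol() and the decoder (see the decodemv.c hunk earlier in this patch) maps the decoded symbol back through av1_ext_tx_inv[]. The round trip only works if the two tables are inverse permutations over TX_TYPES; the sketch below checks that property with placeholder tables, since the real ones are not part of this diff.

#include <assert.h>

#define TX_TYPES_SKETCH 4
/* Placeholder index/inverse tables; the real av1_ext_tx_ind / av1_ext_tx_inv
   live in av1/common. The property being illustrated: inv[ind[t]] == t. */
static const int ext_tx_ind[TX_TYPES_SKETCH] = { 1, 3, 2, 0 };
static const int ext_tx_inv[TX_TYPES_SKETCH] = { 3, 0, 2, 1 };

int main(void) {
  int t;
  for (t = 0; t < TX_TYPES_SKETCH; ++t) assert(ext_tx_inv[ext_tx_ind[t]] == t);
  return 0;
}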
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 04d5282..d3b97d6 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -262,6 +262,8 @@
MACROBLOCKD *const xd = &x->e_mbd;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int bwl = b_width_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
+ const int bhl = b_height_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
set_skip_context(xd, mi_row, mi_col);
@@ -284,6 +286,8 @@
x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+ set_plane_n4(xd, mi_width, mi_height, bwl, bhl);
+
// Set up distance of MB to edge of frame in 1/8th pel units.
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index da6f35c..7276fee 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -23,12 +23,10 @@
static struct av1_token mv_joint_encodings[MV_JOINTS];
static struct av1_token mv_class_encodings[MV_CLASSES];
static struct av1_token mv_fp_encodings[MV_FP_SIZE];
-static struct av1_token mv_class0_encodings[CLASS0_SIZE];
void av1_entropy_mv_init(void) {
av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree);
av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree);
- av1_tokens_from_tree(mv_class0_encodings, av1_mv_class0_tree);
av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
}
@@ -53,8 +51,7 @@
// Integer bits
if (mv_class == MV_CLASS_0) {
- av1_write_token(w, av1_mv_class0_tree, mvcomp->class0,
- &mv_class0_encodings[d]);
+ aom_write(w, d, mvcomp->class0[0]);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index c22c5a8..f1a6f72 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -422,39 +422,6 @@
aom_free(cpi->segmentation_map);
cpi->segmentation_map = NULL;
-#if CONFIG_REF_MV
- for (i = 0; i < NMV_CONTEXTS; ++i) {
- aom_free(cpi->nmv_costs[i][0]);
- aom_free(cpi->nmv_costs[i][1]);
- aom_free(cpi->nmv_costs_hp[i][0]);
- aom_free(cpi->nmv_costs_hp[i][1]);
- cpi->nmv_costs[i][0] = NULL;
- cpi->nmv_costs[i][1] = NULL;
- cpi->nmv_costs_hp[i][0] = NULL;
- cpi->nmv_costs_hp[i][1] = NULL;
- }
-#endif
-
- aom_free(cpi->nmvcosts[0]);
- aom_free(cpi->nmvcosts[1]);
- cpi->nmvcosts[0] = NULL;
- cpi->nmvcosts[1] = NULL;
-
- aom_free(cpi->nmvcosts_hp[0]);
- aom_free(cpi->nmvcosts_hp[1]);
- cpi->nmvcosts_hp[0] = NULL;
- cpi->nmvcosts_hp[1] = NULL;
-
- aom_free(cpi->nmvsadcosts[0]);
- aom_free(cpi->nmvsadcosts[1]);
- cpi->nmvsadcosts[0] = NULL;
- cpi->nmvsadcosts[1] = NULL;
-
- aom_free(cpi->nmvsadcosts_hp[0]);
- aom_free(cpi->nmvsadcosts_hp[1]);
- cpi->nmvsadcosts_hp[0] = NULL;
- cpi->nmvsadcosts_hp[1] = NULL;
-
av1_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
@@ -512,27 +479,15 @@
#if CONFIG_REF_MV
for (i = 0; i < NMV_CONTEXTS; ++i) {
av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
- memcpy(cc->nmv_costs[i][0], cpi->nmv_costs[i][0],
- MV_VALS * sizeof(*cpi->nmv_costs[i][0]));
- memcpy(cc->nmv_costs[i][1], cpi->nmv_costs[i][1],
- MV_VALS * sizeof(*cpi->nmv_costs[i][1]));
- memcpy(cc->nmv_costs_hp[i][0], cpi->nmv_costs_hp[i][0],
- MV_VALS * sizeof(*cpi->nmv_costs_hp[i][0]));
- memcpy(cc->nmv_costs_hp[i][1], cpi->nmv_costs_hp[i][1],
- MV_VALS * sizeof(*cpi->nmv_costs_hp[i][1]));
+ av1_copy(cc->nmv_costs, cpi->nmv_costs);
+ av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
}
#else
av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
#endif
- memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
- MV_VALS * sizeof(*cpi->nmvcosts[0]));
- memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
- MV_VALS * sizeof(*cpi->nmvcosts[1]));
- memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
- memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+ av1_copy(cc->nmvcosts, cpi->nmvcosts);
+ av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp);
av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
@@ -552,25 +507,15 @@
#if CONFIG_REF_MV
for (i = 0; i < NMV_CONTEXTS; ++i) {
av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
- memcpy(cpi->nmv_costs[i][0], cc->nmv_costs[i][0],
- MV_VALS * sizeof(*cc->nmv_costs[i][0]));
- memcpy(cpi->nmv_costs[i][1], cc->nmv_costs[i][1],
- MV_VALS * sizeof(*cc->nmv_costs[i][1]));
- memcpy(cpi->nmv_costs_hp[i][0], cc->nmv_costs_hp[i][0],
- MV_VALS * sizeof(*cc->nmv_costs_hp[i][0]));
- memcpy(cpi->nmv_costs_hp[i][1], cc->nmv_costs_hp[i][1],
- MV_VALS * sizeof(*cc->nmv_costs_hp[i][1]));
+ av1_copy(cpi->nmv_costs, cc->nmv_costs);
+ av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
}
#else
av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
#endif
- memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
- memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
- memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
- MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
- memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
- MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+ av1_copy(cpi->nmvcosts, cc->nmvcosts);
+ av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp);
av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
@@ -2117,33 +2062,15 @@
#if CONFIG_REF_MV
for (i = 0; i < NMV_CONTEXTS; ++i) {
- CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][0])));
- CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][1])));
- CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][0])));
- CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][1])));
+ memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+ memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
}
#endif
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
- aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+ memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
+ memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp));
+ memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts));
+ memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp));
for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
i++) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b55481b..0c66905 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -414,14 +414,14 @@
CODING_CONTEXT coding_context;
#if CONFIG_REF_MV
- int *nmv_costs[NMV_CONTEXTS][2];
- int *nmv_costs_hp[NMV_CONTEXTS][2];
+ int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+ int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
#endif
- int *nmvcosts[2];
- int *nmvcosts_hp[2];
- int *nmvsadcosts[2];
- int *nmvsadcosts_hp[2];
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+ int nmvsadcosts[2][MV_VALS];
+ int nmvsadcosts_hp[2][MV_VALS];
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8d151a7..8ba6b7b 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1771,8 +1771,7 @@
if (colors > 1 && colors <= 64) {
int r, c, i, j, k;
const int max_itr = 50;
- int color_ctx, color_idx = 0;
- int color_order[PALETTE_MAX_SIZE];
+ uint8_t color_order[PALETTE_MAX_SIZE];
float *const data = x->palette_buffer->kmeans_data_buf;
float centroids[PALETTE_MAX_SIZE];
uint8_t *const color_map = xd->plane[0].color_index_map;
@@ -1856,13 +1855,9 @@
1);
for (i = 0; i < rows; ++i) {
for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
- color_ctx = av1_get_palette_color_context(color_map, cols, i, j, k,
- color_order);
- for (r = 0; r < k; ++r)
- if (color_map[i * cols + j] == color_order[r]) {
- color_idx = r;
- break;
- }
+ int color_idx;
+ const int color_ctx = av1_get_palette_color_context(
+ color_map, cols, i, j, k, color_order, &color_idx);
assert(color_idx >= 0 && color_idx < k);
this_rate += cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
}
@@ -2507,7 +2502,7 @@
static void angle_estimation(const uint8_t *src, int src_stride, int rows,
int cols, uint8_t *directional_mode_skip_mask) {
- int i, r, c, dx, dy, temp, sn, remd, quot;
+ int i, r, c, index, dx, dy, temp, sn, remd, quot;
uint64_t hist[DIRECTIONAL_MODES];
uint64_t hist_sum = 0;
@@ -2515,7 +2510,6 @@
src += src_stride;
for (r = 1; r < rows; ++r) {
for (c = 1; c < cols; ++c) {
- uint8_t index;
dx = src[c] - src[c - 1];
dy = src[c] - src[c - src_stride];
temp = dx * dx + dy * dy;
@@ -2538,16 +2532,16 @@
for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
for (i = 0; i < INTRA_MODES; ++i) {
if (i != DC_PRED && i != TM_PRED) {
- const uint8_t index = mode_to_angle_bin[i];
- uint64_t score = 2 * hist[index];
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
int weight = 2;
- if (index > 0) {
- score += hist[index - 1];
- weight += 1;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
}
- if (index < DIRECTIONAL_MODES - 1) {
- score += hist[index + 1];
- weight += 1;
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
}
if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
directional_mode_skip_mask[i] = 1;
@@ -2559,7 +2553,7 @@
static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
int rows, int cols,
uint8_t *directional_mode_skip_mask) {
- int i, r, c, dx, dy, temp, sn, remd, quot;
+ int i, r, c, index, dx, dy, temp, sn, remd, quot;
uint64_t hist[DIRECTIONAL_MODES];
uint64_t hist_sum = 0;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -2568,7 +2562,6 @@
src += src_stride;
for (r = 1; r < rows; ++r) {
for (c = 1; c < cols; ++c) {
- uint8_t index;
dx = src[c] - src[c - 1];
dy = src[c] - src[c - src_stride];
temp = dx * dx + dy * dy;
@@ -2591,16 +2584,16 @@
for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
for (i = 0; i < INTRA_MODES; ++i) {
if (i != DC_PRED && i != TM_PRED) {
- const uint8_t index = mode_to_angle_bin[i];
- uint64_t score = 2 * hist[index];
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
int weight = 2;
- if (index > 0) {
- score += hist[index - 1];
- weight += 1;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
}
- if (index < DIRECTIONAL_MODES - 1) {
- score += hist[index + 1];
- weight += 1;
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
}
if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
directional_mode_skip_mask[i] = 1;
@@ -3654,8 +3647,7 @@
if (colors > 1 && colors <= 64) {
int r, c, n, i, j;
const int max_itr = 50;
- int color_ctx, color_idx = 0;
- int color_order[PALETTE_MAX_SIZE];
+ uint8_t color_order[PALETTE_MAX_SIZE];
int64_t this_sse;
float lb_u, ub_u, val_u;
float lb_v, ub_v, val_v;
@@ -3748,13 +3740,9 @@
for (i = 0; i < rows; ++i) {
for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
- color_ctx = av1_get_palette_color_context(color_map, cols, i, j, n,
- color_order);
- for (r = 0; r < n; ++r)
- if (color_map[i * cols + j] == color_order[r]) {
- color_idx = r;
- break;
- }
+ int color_idx;
+ const int color_ctx = av1_get_palette_color_context(
+ color_map, cols, i, j, n, color_order, &color_idx);
assert(color_idx >= 0 && color_idx < n);
this_rate += cpi->palette_uv_color_cost[n - 2][color_ctx][color_idx];
}
@@ -9385,7 +9373,7 @@
int best_rate_nocoef;
#endif
int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
- int skippable = 0;
+ int skippable = 0, rate_overhead = 0;
TX_SIZE best_tx_size, uv_tx;
TX_TYPE best_tx_type;
PALETTE_MODE_INFO palette_mode_info;
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index fd0f76b..67f4b5d 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -369,8 +369,8 @@
int rate = av1_cost_coeffs(cm, x, plane, block, pt, tx_size, scan_order->scan,
scan_order->neighbors, 0);
args->this_rate += rate;
- av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
- blk_row);
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, tx_size, p->eobs[block] > 0, blk_col, blk_row);
}
static void set_entropy_context_b(int plane, int block, int blk_row,
@@ -382,8 +382,8 @@
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
- av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
- blk_row);
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, tx_size, p->eobs[block] > 0, blk_col, blk_row);
}
static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
@@ -410,18 +410,19 @@
}
#if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const AV1_COMP *cpi, struct ThreadData *const td,
- int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
- BLOCK_SIZE bsize, int *rate) {
- MACROBLOCK *const x = &td->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- uint8_t *color_map = xd->plane[plane != 0].color_index_map;
- PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
- int n = pmi->palette_size[plane != 0];
- int i, j, k;
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
+ const MACROBLOCK *const x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const uint8_t *const color_map = xd->plane[plane != 0].color_index_map;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int n = pmi->palette_size[plane != 0];
+ int i, j;
int this_rate = 0;
- int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+ uint8_t color_order[PALETTE_MAX_SIZE];
const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
(xd->plane[plane != 0].subsampling_y);
const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -432,17 +433,13 @@
for (i = 0; i < rows; ++i) {
for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
- color_ctx =
- av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
- for (k = 0; k < n; ++k)
- if (color_map[i * cols + j] == color_order[k]) {
- color_idx = k;
- break;
- }
- assert(color_idx >= 0 && color_idx < n);
+ int color_new_idx;
+ const int color_ctx = av1_get_palette_color_context(
+ color_map, cols, i, j, n, color_order, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
if (dry_run == DRY_RUN_COSTCOEFFS)
- this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
- (*t)->token = color_idx;
+ this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_new_idx];
+ (*t)->token = color_new_idx;
(*t)->context_tree = probs[n - 2][color_ctx];
(*t)->skip_eob_node = 0;
++(*t);
@@ -501,6 +498,7 @@
int skip_eob = 0;
int16_t token;
EXTRABIT extra;
+ (void)plane_bsize;
pt = get_entropy_context(tx_size, pd->above_context + blk_col,
pd->left_context + blk_row);
scan = scan_order->scan;
@@ -535,7 +533,7 @@
*tp = t;
- av1_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
+ av1_set_contexts(xd, pd, tx_size, c > 0, blk_col, blk_row);
}
struct is_skippable_args {
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index ae896a6..89610df 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -72,7 +72,7 @@
#endif
#if CONFIG_PALETTE
void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
- struct ThreadData *const td, int plane,
+ const struct ThreadData *const td, int plane,
TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
int *rate);
#endif // CONFIG_PALETTE
diff --git a/configure b/configure
index 99d2bb8..2659d37 100755
--- a/configure
+++ b/configure
@@ -616,20 +616,18 @@
check_add_cflags -Wvla
check_add_cflags -Wimplicit-function-declaration
check_add_cflags -Wuninitialized
- check_add_cflags -Wunused-variable
+ check_add_cflags -Wunused
check_add_cflags -Wsign-compare
+ # Enabling the following warning (in combination with -Wunused above)
+ # for C++ generates errors in third_party code including googletest and
+ # libyuv. So enable it only for C code.
+ check_cflags "-Wextra" && add_cflags_only "-Wextra"
# Enabling the following warning for C++ generates some useless warnings
# about some function parameters shadowing class member function names.
# So, only enable this warning for C code.
check_cflags "-Wshadow" && add_cflags_only "-Wshadow"
- case ${CC} in
- *clang*) ;;
- *) check_add_cflags -Wunused-but-set-variable ;;
- esac
if enabled mips || [ -z "${INLINE}" ]; then
enabled extra_warnings || check_add_cflags -Wno-unused-function
- else
- check_add_cflags -Wunused-function
fi
fi
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index fdb9739..6beb4fb 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -191,8 +191,7 @@
}
static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
- aom_codec_enc_cfg_t *cfg, unsigned int frame_out,
- int *mismatch_seen) {
+ unsigned int frame_out, int *mismatch_seen) {
aom_image_t enc_img, dec_img;
struct av1_ref_frame ref_enc, ref_dec;
@@ -226,11 +225,10 @@
aom_img_free(&dec_img);
}
-static int encode_frame(aom_codec_ctx_t *ecodec, aom_codec_enc_cfg_t *cfg,
- aom_image_t *img, unsigned int frame_in,
- AvxVideoWriter *writer, int test_decode,
- aom_codec_ctx_t *dcodec, unsigned int *frame_out,
- int *mismatch_seen) {
+static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
+ unsigned int frame_in, AvxVideoWriter *writer,
+ int test_decode, aom_codec_ctx_t *dcodec,
+ unsigned int *frame_out, int *mismatch_seen) {
int got_pkts = 0;
aom_codec_iter_t iter = NULL;
const aom_codec_cx_pkt_t *pkt = NULL;
@@ -271,7 +269,7 @@
// Mismatch checking
if (got_data && test_decode) {
- testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
+ testing_decode(ecodec, dcodec, *frame_out, mismatch_seen);
}
return got_pkts;
@@ -280,12 +278,12 @@
int main(int argc, char **argv) {
FILE *infile = NULL;
// Encoder
- aom_codec_ctx_t ecodec = { 0 };
- aom_codec_enc_cfg_t cfg = { 0 };
+ aom_codec_ctx_t ecodec;
+ aom_codec_enc_cfg_t cfg;
unsigned int frame_in = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
@@ -311,6 +309,12 @@
unsigned int limit = 0;
exec_name = argv[0];
+ // Clear explicitly, as initializing with "{ 0 }" generates a
+ // "missing-field-initializers" warning in some compilers.
+ memset(&ecodec, 0, sizeof(ecodec));
+ memset(&cfg, 0, sizeof(cfg));
+ memset(&info, 0, sizeof(info));
+
if (argc < 7) die("Invalid number of arguments");
codec_arg = argv[1];
@@ -404,7 +408,7 @@
}
}
- encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, &dcodec,
+ encode_frame(&ecodec, &raw, frame_in, writer, test_decode, &dcodec,
&frame_out, &mismatch_seen);
frame_in++;
if (mismatch_seen) break;
@@ -412,8 +416,8 @@
// Flush encoder.
if (!mismatch_seen)
- while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
- &dcodec, &frame_out, &mismatch_seen)) {
+ while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+ &frame_out, &mismatch_seen)) {
}
printf("\n");
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 069e35e..1abeb27 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -63,13 +63,17 @@
int frame_count = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
const int fps = 30;
exec_name = argv[0];
+ // Clear explicitly, as initializing with "{ 0 }" generates a
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
if (argc < 5) die("Invalid number of arguments");
encoder = get_aom_encoder_by_name("av1");
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 418757d..1d2b51e 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -151,7 +151,7 @@
int frame_count = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
const int fps = 30;
@@ -168,6 +168,10 @@
exec_name = argv[0];
+ // Clear explicitly, as initializing with "{ 0 }" generates a
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
if (argc != 9) die("Invalid number of arguments");
codec_arg = argv[1];
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index b83ae94..b891e99 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc
@@ -54,7 +54,6 @@
const size_t maxBlockSize = maxWidth * maxHeight;
const int horizOffset = 32;
const int vertiOffset = 32;
-const size_t testMaxBlk = 128;
const int stride = 128;
const int x_step_q4 = 16;
@@ -90,7 +89,7 @@
void RunVertFilterBitExactCheck();
private:
- void PrepFilterBuffer(int w, int h);
+ void PrepFilterBuffer();
void DiffFilterBuffer();
conv_filter_t conv_horiz_;
conv_filter_t conv_vert_;
@@ -106,7 +105,7 @@
int avg_;
};
-void AV1ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1ConvolveOptimzTest::PrepFilterBuffer() {
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -150,7 +149,7 @@
}
void AV1ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
@@ -167,7 +166,7 @@
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
@@ -180,7 +179,7 @@
}
void AV1ConvolveOptimzTest::RunVertFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
@@ -266,7 +265,7 @@
void RunVertFilterBitExactCheck();
private:
- void PrepFilterBuffer(int w, int h);
+ void PrepFilterBuffer();
void DiffFilterBuffer();
hbd_conv_filter_t conv_horiz_;
hbd_conv_filter_t conv_vert_;
@@ -283,7 +282,7 @@
int bit_depth_;
};
-void AV1HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1HbdConvolveOptimzTest::PrepFilterBuffer() {
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -326,7 +325,7 @@
}
void AV1HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
@@ -344,7 +343,7 @@
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_,
@@ -357,7 +356,7 @@
}
void AV1HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
diff --git a/test/codec_factory.h b/test/codec_factory.h
index c92d5c1..b645102 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -123,6 +123,9 @@
#if CONFIG_AV1_DECODER
return new AV1Decoder(cfg, flags, deadline);
#else
+ (void)cfg;
+ (void)flags;
+ (void)deadline;
return NULL;
#endif
}
@@ -134,6 +137,10 @@
#if CONFIG_AV1_ENCODER
return new AV1Encoder(cfg, deadline, init_flags, stats);
#else
+ (void)cfg;
+ (void)deadline;
+ (void)init_flags;
+ (void)stats;
return NULL;
#endif
}
@@ -143,6 +150,8 @@
#if CONFIG_AV1_ENCODER
return aom_codec_enc_config_default(&aom_codec_av1_cx_algo, cfg, usage);
#else
+ (void)cfg;
+ (void)usage;
return AOM_CODEC_INCAPABLE;
#endif
}
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index e73daa5..9811955 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -264,12 +264,12 @@
}
void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
- int tx_type) {
+ int /*tx_type*/) {
idct16x16_10(in, out, stride);
}
void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
- int tx_type) {
+ int /*tx_type*/) {
idct16x16_12(in, out, stride);
}
@@ -727,7 +727,7 @@
virtual void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+ void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 95a0eb5..7adb9d6 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -92,7 +92,7 @@
aom_codec_err_t res_dec =
decoder->DecodeFrame(video->cxdata(), video->frame_size());
- if (!HandleDecodeResult(res_dec, *video, decoder)) break;
+ if (!HandleDecodeResult(res_dec, decoder)) break;
} else {
// Signal end of the file to the decoder.
const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index aabca40..b8f8d1a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -141,7 +141,6 @@
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- const CompressedVideoSource & /*video*/,
Decoder *decoder) {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
return AOM_CODEC_OK == res_dec;
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index c1a0cb7..092e669 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -275,7 +275,7 @@
aom_codec_err_t res_dec = decoder->DecodeFrame(
(const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
- if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;
has_dxdata = true;
}
@@ -293,7 +293,7 @@
// Flush the decoder when there are no more fragments.
if ((init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
- if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;
}
if (has_dxdata && has_cxdata) {
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 11c387a..45a080e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -228,7 +228,6 @@
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- const VideoSource & /*video*/,
Decoder *decoder) {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
return AOM_CODEC_OK == res_dec;
diff --git a/test/encoder_parms_get_to_decoder.cc b/test/encoder_parms_get_to_decoder.cc
index 640e12f..52d68b1 100644
--- a/test/encoder_parms_get_to_decoder.cc
+++ b/test/encoder_parms_get_to_decoder.cc
@@ -94,7 +94,6 @@
}
virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- const libaom_test::VideoSource & /*video*/,
libaom_test::Decoder *decoder) {
aom_codec_ctx_t *const av1_decoder = decoder->GetDecoder();
aom_codec_alg_priv_t *const priv =
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 951c47f..07b6039 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -55,8 +55,7 @@
nframes_++;
}
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- ::libaom_test::Encoder * /*encoder*/) {
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video) {
frame_flags_ &=
~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF);
if (droppable_nframes_ > 0 &&
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index f1fad70..25b8718 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -28,7 +28,6 @@
}
virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- const libaom_test::VideoSource & /*video*/,
libaom_test::Decoder *decoder) {
EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
return !::testing::Test::HasFailure();
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 5ff5090..7848e20 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -101,8 +101,7 @@
}
RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = static_cast<uint32_t>(sse);
- return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
/* The subpel reference functions differ from the codec version in one aspect:
@@ -157,8 +156,7 @@
}
RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = static_cast<uint32_t>(sse);
- return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
@@ -211,8 +209,7 @@
}
RoundHighBitDepth(bit_depth, &se, &sse);
*sse_ptr = static_cast<uint32_t>(sse);
- return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/webmenc.cc b/webmenc.cc
index f78f027..e3d209a 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -24,7 +24,6 @@
void write_webm_file_header(struct WebmOutputContext *webm_ctx,
const aom_codec_enc_cfg_t *cfg,
- const struct aom_rational *fps,
stereo_format_t stereo_fmt, unsigned int fourcc,
const struct AvxRational *par) {
mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
diff --git a/webmenc.h b/webmenc.h
index 90211ff..74387fb 100644
--- a/webmenc.h
+++ b/webmenc.h
@@ -40,7 +40,6 @@
void write_webm_file_header(struct WebmOutputContext *webm_ctx,
const aom_codec_enc_cfg_t *cfg,
- const struct aom_rational *fps,
stereo_format_t stereo_fmt, unsigned int fourcc,
const struct AvxRational *par);