Add HBD data path for av1_block_error_avx2 - Add unit test for av1_block_error. - Fix av1_dist_block logic for calling av1_block_error. Change-Id: Id8a47ee113417360a29fc2334d9ca72b5793e2d7

commit: d61e608d7a40637a4e185a6d92fe9cde90be564d [log] [tgz]
author: Yi Luo <luoyi@google.com> Fri May 26 16:14:39 2017 -0700
committer: Yi Luo <luoyi@google.com> Wed Jun 07 23:46:30 2017 +0000
tree: 4e9eb508bddd4f0abf64ee0ddc87fdd352af85a0
parent: b91155a9c13484fd9b9adc033dd1908a41f12cff [diff]
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 19f4204..6be2be0 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -278,6 +278,7 @@
     # the transform coefficients are held in 32-bit
     # values, so the assembler code for  av1_block_error can no longer be used.
     add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/av1_block_error avx2/;
 
     add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
 
@@ -310,6 +311,7 @@
     # the transform coefficients are held in 32-bit
     # values, so the assembler code for  av1_block_error can no longer be used.
     add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/av1_block_error avx2/;
 
     add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/av1_quantize_fp sse2/;

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 396c0a0..8ca465c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1292,14 +1292,16 @@
                                    &this_sse) >>
                 shift;
 #endif  // CONFIG_HIGHBITDEPTH
-#elif CONFIG_HIGHBITDEPTH
-    const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
-    *out_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >>
-        shift;
-#else
-    *out_dist =
-        av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
+#else   // !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
+                                         &this_sse, xd->bd) >>
+                  shift;
+    else
+#endif
+      *out_dist =
+          av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
 #endif  // CONFIG_PVQ
     *out_sse = this_sse >> shift;
   } else {

diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
index ae733a1..20ba414 100644
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ b/av1/encoder/x86/error_intrin_avx2.c

@@ -14,7 +14,20 @@
 #include "./av1_rtcd.h"
 #include "aom/aom_integer.h"
 
-int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+#if CONFIG_HIGHBITDEPTH
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+  const __m256i y = _mm256_packs_epi32(x0, x1);
+  *c = _mm256_permute4x64_epi64(y, 0xD8);
+#else
+  *c = _mm256_loadu_si256((const __m256i *)addr);
+#endif
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
@@ -22,16 +35,16 @@
   __m128i sse_reg128, ssz_reg128;
   int64_t sse;
   int i;
-  const __m256i zero_reg = _mm256_set1_epi16(0);
+  const __m256i zero_reg = _mm256_setzero_si256();
 
   // init sse and ssz registerd to zero
-  sse_reg = _mm256_set1_epi16(0);
-  ssz_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_setzero_si256();
+  ssz_reg = _mm256_setzero_si256();
 
   for (i = 0; i < block_size; i += 16) {
     // load 32 bytes from coeff and dqcoeff
-    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
-    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    read_coeff(coeff, i, &coeff_reg);
+    read_coeff(dqcoeff, i, &dqcoeff_reg);
     // dqcoeff - coeff
     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
     // madd (dqcoeff - coeff)
commit	d61e608d7a40637a4e185a6d92fe9cde90be564d	[log] [tgz]
author	Yi Luo <luoyi@google.com>	Fri May 26 16:14:39 2017 -0700
committer	Yi Luo <luoyi@google.com>	Wed Jun 07 23:46:30 2017 +0000
tree	4e9eb508bddd4f0abf64ee0ddc87fdd352af85a0
parent	b91155a9c13484fd9b9adc033dd1908a41f12cff [diff]