Implement combined parallel_deblocking experiment

The parallel_deblocking experiment is proposed jointly by Intel and Microsoft. The following changes are implemented in this experiment:
- The deblocking filter order is changed to filter all vertical edges of the whole frame, followed by all horizontal edges of the whole frame.
- The filter length decision is based on the transform block sizes on both sides of the edge; the block with the smaller transform size determines the final filter length.
- The transform blocks on both sides of the edge are checked; filtering of that edge can be skipped only when both blocks are skipped and they belong to the same prediction block.
- The 15-tap filter and extended flat area detection are removed.
- The special rule for handling 4x4 transform blocks on the super block boundary in VP9 is removed.

Change-Id: I1aa82c6b5335d47c2f73eec8fc8bee2c08a1cf74

commit: 392d0ff726ce9fdf825b61a9b9b776b9448b5a87 [log] [tgz]
author: Ryan Lei <ryan.z.lei@intel.com> Thu Feb 09 13:05:42 2017 -0800
committer: Ryan Lei <ryan.z.lei@intel.com> Wed Mar 01 19:59:33 2017 +0000
tree: bcd61e57047b4899f448c5a1c6e9a6d7d5ed9515
parent: ab77e73b77571ebed608708465a1b1d4b848e5a6 [diff] [blame]
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index c5054b5..27b7a3f 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c

@@ -30,8 +30,17 @@
   }
 }
 #endif
-
+#if CONFIG_PARALLEL_DEBLOCKING
 // should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {
+  int8_t mask = 0;  // set to all-ones as soon as any check below trips
+  mask |= (abs(p1 - p0) > limit) * -1;  // (cond) * -1 yields 0 or -1
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;  // inverted: all-ones only when no check tripped
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                  uint8_t q1, uint8_t q2, uint8_t q3) {
@@ -118,10 +127,16 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     ++s;
   }
@@ -142,10 +157,16 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     s += pitch;
   }
@@ -351,6 +372,21 @@
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_PARALLEL_DEBLOCKING
+// Mask from p1, p0, q0, q1 only: 11111111 apply the filter, 00000000 skip it.
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+                                         uint16_t p1, uint16_t p0, uint16_t q0,
+                                         uint16_t q1, int bd) {
+  int8_t mask = 0;  // set to all-ones as soon as any check below trips
+  int16_t limit16 = (uint16_t)limit << (bd - 8);  // limit * 2^(bd - 8)
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);  // blimit * 2^(bd - 8)
+  mask |= (abs(p1 - p0) > limit16) * -1;  // (cond) * -1 yields 0 or -1
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;  // inverted: all-ones only when no check tripped
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                         uint16_t p3, uint16_t p2, uint16_t p1,
@@ -449,6 +485,7 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
     const uint16_t p1 = s[-2 * p];
@@ -459,6 +496,14 @@
     const uint16_t q3 = s[3 * p];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     ++s;
   }
@@ -480,10 +525,17 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     s += pitch;
   }
commit	392d0ff726ce9fdf825b61a9b9b776b9448b5a87	[log] [tgz]
author	Ryan Lei <ryan.z.lei@intel.com>	Thu Feb 09 13:05:42 2017 -0800
committer	Ryan Lei <ryan.z.lei@intel.com>	Wed Mar 01 19:59:33 2017 +0000
tree	bcd61e57047b4899f448c5a1c6e9a6d7d5ed9515
parent	ab77e73b77571ebed608708465a1b1d4b848e5a6 [diff] [blame]