implement combined parallel_deblocking experiment

The parallel_deblocking experiment is proposed jointly by Intel
and Microsoft. The following changes are implemented in this
experiment:

- deblocking filter order is changed to filter all vertical edges
  of the whole frame followed by filtering all horizontal edges
  of the whole frame

- the filter length decision is based on the transform block sizes
  on both sides of the edge; the block with the smaller transform
  size determines the final filter length.

- transform blocks on both sides of the edge are checked; filtering
  of an edge can be skipped only when both blocks are skipped and
  they belong to the same prediction block.

- 15-tap filter and extended flat area detection are removed.

- the special VP9 rule for handling 4x4 transform blocks on the
  superblock boundary is removed.

Change-Id: I1aa82c6b5335d47c2f73eec8fc8bee2c08a1cf74
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index c5054b5..27b7a3f 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -30,8 +30,17 @@
   }
 }
 #endif
-
+#if CONFIG_PARALLEL_DEBLOCKING
 // should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {
+  int8_t mask = 0;  // stays zero only if every check below passes
+  mask |= (abs(p1 - p0) > limit) * -1;  // step between the two p samples
+  mask |= (abs(q1 - q0) > limit) * -1;  // step between the two q samples
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;  // across edge
+  return ~mask;  // all ones: apply the filter; zero: do not
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                  uint8_t q1, uint8_t q2, uint8_t q3) {
@@ -118,10 +127,16 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     ++s;
   }
@@ -142,10 +157,16 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     s += pitch;
   }
@@ -351,6 +372,21 @@
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_PARALLEL_DEBLOCKING
+// 4-sample high-bitdepth variant: returns all ones to filter, zero to skip.
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+                                         uint16_t p1, uint16_t p0, uint16_t q0,
+                                         uint16_t q1, int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);  // scaled to bd bits
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);  // scaled to bd bits
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                         uint16_t p3, uint16_t p2, uint16_t p1,
@@ -449,6 +485,7 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
     const uint16_t p1 = s[-2 * p];
@@ -459,6 +496,14 @@
     const uint16_t q3 = s[3 * p];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     ++s;
   }
@@ -480,10 +525,17 @@
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     s += pitch;
   }
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 2630b51..7e134dc 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -19,6 +19,40 @@
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
+#if CONFIG_PARALLEL_DEBLOCKING
+// Sets hev and mask (filter_mask2 logic) from q1p1, q0p0, p1p0, q1q0.
+#define FILTER_HEV_MASK4                                                      \
+  do {                                                                        \
+    /* abs(q1 - q0), abs(p1 - p0) */                                          \
+    __m128i flat = abs_diff(q1p1, q0p0);                                      \
+    /* abs(p1 - q1), abs(p0 - q0) */                                          \
+    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
+    __m128i abs_p0q0, abs_p1q1;                                               \
+                                                                              \
+    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
+    hev =                                                                     \
+        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
+    hev = _mm_packs_epi16(hev, hev);                                          \
+                                                                              \
+    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
+    /*                                  p1, p0, q0, q1); */                   \
+    abs_p0q0 =                                                                \
+        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
+    abs_p1q1 =                                                                \
+        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
+    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
+    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
+    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
+    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
+    mask = _mm_unpacklo_epi64(mask, flat);                                    \
+    mask = _mm_subs_epu8(mask, limit);                                        \
+    mask = _mm_cmpeq_epi8(mask, zero);                                        \
+    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
+  } while (0)
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 // filter_mask and hev_mask
 #define FILTER_HEV_MASK                                                       \
   do {                                                                        \
@@ -114,23 +148,34 @@
   const __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3p2, p2p1, q3q2, q2q1;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
-
+#if !CONFIG_PARALLEL_DEBLOCKING
   p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                             _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+#if !CONFIG_PARALLEL_DEBLOCKING
   q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
   FILTER_HEV_MASK;
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  FILTER_HEV_MASK4;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   FILTER4;
 
   _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
@@ -150,7 +195,10 @@
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i x0, x1, x2, x3;
-  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3p2, p2p1, q3q2, q2q1;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
@@ -174,29 +222,40 @@
   p1p0 = _mm_unpacklo_epi16(q1q0, x1);
   // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
   x0 = _mm_unpacklo_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
   // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
   p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
   p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+#if !CONFIG_PARALLEL_DEBLOCKING
   p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
 
   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
   q1q0 = _mm_unpackhi_epi16(q1q0, x1);
   // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
   x2 = _mm_unpackhi_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
   // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
   q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
   q1q0 = _mm_unpacklo_epi32(q1q0, x2);
 
   q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
   q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
   FILTER_HEV_MASK;
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  FILTER_HEV_MASK4;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   FILTER4;
 
   // Transpose 8x4 to 4x8
@@ -1395,18 +1454,23 @@
       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                          _mm_load_si128((const __m128i *)_thresh1));
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3, p2, q2, q3;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p1, p0, q0, q1;
   __m128i mask, hev, flat;
-
+#if !CONFIG_PARALLEL_DEBLOCKING
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+#if !CONFIG_PARALLEL_DEBLOCKING
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   // filter_mask and hev_mask
   {
     const __m128i abs_p1p0 =
@@ -1419,8 +1483,9 @@
         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
     __m128i abs_p1q1 =
         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+#if !CONFIG_PARALLEL_DEBLOCKING
     __m128i work;
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -1431,6 +1496,7 @@
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(flat, mask);
+#if !CONFIG_PARALLEL_DEBLOCKING
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
     work = _mm_max_epu8(
@@ -1441,6 +1507,7 @@
         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
     mask = _mm_max_epu8(work, mask);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
@@ -1584,6 +1651,51 @@
   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
 }
 
+#if CONFIG_PARALLEL_DEBLOCKING
+#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
+#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
+#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
+#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
+#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)  // 4-byte store
+#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
+enum { ROTATE_DWORD_RIGHT = 0x39 };  // dwords (3,2,1,0) -> (0,3,2,1)
+static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
+                                 const uint8_t *pSrc,
+                                 const ptrdiff_t srcStride) {
+  for (uint32_t idx = 0; idx < 2; idx += 1) {  // two 8-column halves
+    __m128i r0, r1, r2, r3;
+    // load 8 bytes from each of 4 source rows
+    r0 = movq(pSrc);
+    r1 = movq(pSrc + srcStride);
+    r2 = movq(pSrc + srcStride * 2);
+    r3 = movq(pSrc + srcStride * 3);
+    // interleave so r0 = columns 0-3 and r1 = columns 4-7, 4 bytes each
+    r0 = punpcklbw(r0, r1);
+    r2 = punpcklbw(r2, r3);
+    r1 = punpckhwd(r0, r2);
+    r0 = punpcklwd(r0, r2);
+    // store one 4-byte column per destination row, rotating to the next
+    movd(pDst, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 2, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 3, r0);
+    movd(pDst + dstStride * 4, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 5, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 6, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 7, r1);
+    // next pass: 8 destination rows down, 8 source bytes to the right
+    pDst += dstStride * 8;
+    pSrc += 8;
+  }
+}
+
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 static INLINE void transpose(unsigned char *src[], int in_p,
                              unsigned char *dst[], int out_p,
                              int num_8x8_to_transpose) {
@@ -1663,15 +1775,17 @@
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+#if !CONFIG_PARALLEL_DEBLOCKING
   unsigned char *src[2];
   unsigned char *dst[2];
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   // Transpose 8x16
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
   aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
+#if !CONFIG_PARALLEL_DEBLOCKING
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
@@ -1679,6 +1793,9 @@
 
   // Transpose back
   transpose(src, 16, dst, p, 2);
+#else  // CONFIG_PARALLEL_DEBLOCKING
+  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
 }
 
 void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index e703ca0..49ee48c 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -1938,6 +1938,345 @@
   dst->buf = dst0;
 }
 
+#if CONFIG_PARALLEL_DEBLOCKING
+
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+
+static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {
+  // vertical edges: (block width - 1); coord & mask == 0 marks a block edge
+  {
+#if CONFIG_CB4X4
+      2 - 1,   // BLOCK_2X2
+      2 - 1,   // BLOCK_2X4
+      4 - 1,   // BLOCK_4X2
+#endif         // CONFIG_CB4X4
+      4 - 1,   // BLOCK_4X4
+      4 - 1,   // BLOCK_4X8
+      8 - 1,   // BLOCK_8X4
+      8 - 1,   // BLOCK_8X8
+      8 - 1,   // BLOCK_8X16
+      16 - 1,  // BLOCK_16X8
+      16 - 1,  // BLOCK_16X16
+      16 - 1,  // BLOCK_16X32
+      32 - 1,  // BLOCK_32X16
+      32 - 1,  // BLOCK_32X32
+      32 - 1,  // BLOCK_32X64
+      64 - 1,  // BLOCK_64X32
+      64 - 1,  // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+      64 - 1,   // BLOCK_64X128
+      128 - 1,  // BLOCK_128X64
+      128 - 1   // BLOCK_128X128
+#endif          // CONFIG_EXT_PARTITION
+  },
+
+  // horizontal edges: (block height - 1), tested the same way
+  {
+#if CONFIG_CB4X4
+      2 - 1,   // BLOCK_2X2
+      4 - 1,   // BLOCK_2X4
+      2 - 1,   // BLOCK_4X2
+#endif         // CONFIG_CB4X4
+      4 - 1,   // BLOCK_4X4
+      8 - 1,   // BLOCK_4X8
+      4 - 1,   // BLOCK_8X4
+      8 - 1,   // BLOCK_8X8
+      16 - 1,  // BLOCK_8X16
+      8 - 1,   // BLOCK_16X8
+      16 - 1,  // BLOCK_16X16
+      32 - 1,  // BLOCK_16X32
+      16 - 1,  // BLOCK_32X16
+      32 - 1,  // BLOCK_32X32
+      64 - 1,  // BLOCK_32X64
+      32 - 1,  // BLOCK_64X32
+      64 - 1,  // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+      128 - 1,  // BLOCK_64X128
+      64 - 1,   // BLOCK_128X64
+      128 - 1   // BLOCK_128X128
+#endif          // CONFIG_EXT_PARTITION
+  },
+};
+
+static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = {
+  {  // VERT_EDGE: transform width - 1
+#if CONFIG_CB4X4
+      2 - 1,  // TX_2X2
+#endif  // CONFIG_CB4X4
+      4 - 1,   // TX_4X4
+      8 - 1,   // TX_8X8
+      16 - 1,  // TX_16X16
+      32 - 1,  // TX_32X32
+#if CONFIG_TX64X64
+      64 - 1,  // TX_64X64
+#endif         // CONFIG_TX64X64
+      4 - 1,   // TX_4X8
+      8 - 1,   // TX_8X4
+      8 - 1,   // TX_8X16
+      16 - 1,  // TX_16X8
+      16 - 1,  // TX_16X32
+      32 - 1,  // TX_32X16
+      4 - 1,   // TX_4X16
+      16 - 1,  // TX_16X4
+      8 - 1,   // TX_8X32
+      32 - 1   // TX_32X8
+  },
+
+  {  // HORZ_EDGE: transform height - 1
+#if CONFIG_CB4X4
+      2 - 1,  // TX_2X2
+#endif  // CONFIG_CB4X4
+      4 - 1,   // TX_4X4
+      8 - 1,   // TX_8X8
+      16 - 1,  // TX_16X16
+      32 - 1,  // TX_32X32
+#if CONFIG_TX64X64
+      64 - 1,  // TX_64X64
+#endif         // CONFIG_TX64X64
+      8 - 1,   // TX_4X8
+      4 - 1,   // TX_8X4
+      16 - 1,  // TX_8X16
+      8 - 1,   // TX_16X8
+      32 - 1,  // TX_16X32
+      16 - 1,  // TX_32X16
+      16 - 1,  // TX_4X16
+      4 - 1,   // TX_16X4
+      32 - 1,  // TX_8X32
+      8 - 1    // TX_32X8
+  }
+};
+
+static TX_SIZE av1_get_transform_size(const MODE_INFO *const pCurr,
+                                      const EDGE_DIR edgeDir,
+                                      const uint32_t scaleHorz,
+                                      const uint32_t scaleVert) {
+  const BLOCK_SIZE bs = pCurr->mbmi.sb_type;
+  TX_SIZE txSize;
+
+  // For chroma planes and non-square transforms, convert the block's transform
+  // size into the size that applies along the requested edge direction.
+
+  txSize = uv_txsize_lookup[bs][pCurr->mbmi.tx_size][scaleHorz][scaleVert];
+
+  if (VERT_EDGE == edgeDir) {
+    txSize = txsize_horz_map[txSize];
+  } else {
+    txSize = txsize_vert_map[txSize];
+  }
+
+  return txSize;
+}
+
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+  // filter length for the block's outer edge: 0 (none), 4 or 8
+  uint32_t filterLength;
+  // filter length for the edge inside the block: 0 (none) or 4
+  uint32_t filterLengthInternal;
+
+  // thresholds taken from cm->lf_info.lfthr for the chosen filter level
+  const uint8_t *lim;
+  const uint8_t *mblim;
+  const uint8_t *hev_thr;
+} AV1_DEBLOCKING_PARAMETERS;
+
+static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
+                               const MODE_INFO **const ppCurr,
+                               const ptrdiff_t modeStep,
+                               const AV1_COMMON *const cm,
+                               const EDGE_DIR edgeDir, const uint32_t x,
+                               const uint32_t y, const uint32_t width,
+                               const uint32_t height, const uint32_t scaleHorz,
+                               const uint32_t scaleVert) {
+  // default: neither the outer nor the internal edge is filtered
+  pParams->filterLength = 0;
+  pParams->filterLengthInternal = 0;
+
+  // (x, y) lies outside the width x height area: nothing to filter
+  if ((width <= x) || (height <= y)) {
+    return;
+  }
+
+#if CONFIG_EXT_PARTITION
+  // not sure if changes are required.
+  assert(0 && "Not yet updated");
+#endif  // CONFIG_EXT_PARTITION
+
+  {
+    const TX_SIZE ts =
+        av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert);
+    const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi);
+    const int currSkipped =
+        ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi);
+    const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y);
+    uint32_t level = currLevel;
+
+    // prepare outer edge parameters. deblock the edge if it's an edge of a TU
+    if (coord) {
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+      if (!av1_disable_loopfilter_on_tile_boundary(cm) ||
+          ((VERT_EDGE == edgeDir) &&
+           (0 == (ppCurr[0]->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) ||
+          ((HORZ_EDGE == edgeDir) &&
+           (0 == (ppCurr[0]->mbmi.boundary_info & TILE_ABOVE_BOUNDARY))))
+#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+      {
+        const int32_t tuEdge =
+            (coord & av1_transform_masks[edgeDir][ts]) ? (0) : (1);
+
+        if (tuEdge) {
+          const MODE_INFO *const pPrev = *(ppCurr - modeStep);
+          const TX_SIZE pvTs =
+              av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
+          const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
+          const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
+          const int32_t puEdge =
+              (coord &
+               av1_prediction_masks[edgeDir]
+                                   [ss_size_lookup[ppCurr[0]->mbmi.sb_type]
+                                                  [scaleHorz][scaleVert]])
+                  ? (0)
+                  : (1);
+
+          // when both the current and the previous block are skipped,
+          // filter the edge only if it is also a PU edge.
+          if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) {
+            pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8);
+            // use the current block's level, or the previous block's
+            // level when the current one is zero
+            level = (currLevel) ? (currLevel) : (pvLvl);
+          }
+        }
+      }
+
+      // prepare internal edge parameters (only reached when coord != 0)
+      if (currLevel && !currSkipped) {
+        pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0);
+      }
+
+      // fetch the thresholds for the selected filter level
+      if (pParams->filterLength || pParams->filterLengthInternal) {
+        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+
+        pParams->lim = limits->lim;
+        pParams->mblim = limits->mblim;
+        pParams->hev_thr = limits->hev_thr;
+      }
+    }
+  }
+}
+
+static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                        const MACROBLOCKD_PLANE *const pPlane,
+                                        const MODE_INFO **ppModeInfo,
+                                        const ptrdiff_t modeStride,
+                                        const uint32_t cuX,
+                                        const uint32_t cuY) {
+  const uint32_t scaleHorz = pPlane->subsampling_x;
+  const uint32_t scaleVert = pPlane->subsampling_y;
+  const uint32_t width = pPlane->dst.width;
+  const uint32_t height = pPlane->dst.height;
+  uint8_t *const pDst = pPlane->dst.buf;  // NOTE(review): 8-bit only?
+  const int dstStride = pPlane->dst.stride;
+
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+    uint8_t *p = pDst + y * MI_SIZE * dstStride;
+
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+      const MODE_INFO **const pCurr =
+          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      set_lpf_parameters(&params, pCurr, ((ptrdiff_t)1 << scaleHorz), cm,
+                         VERT_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+                         height, scaleHorz, scaleVert);
+
+      switch (params.filterLength) {
+        // outer edge, filter length 4
+        case 4:
+          aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+
+        // outer edge, filter length 8
+        case 8:
+          aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+
+        // filter length 0: the outer edge is left as is
+        default: break;
+      }
+
+      // internal edge sits 4 samples to the right of the outer edge
+      if (params.filterLengthInternal) {
+        aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
+                           params.hev_thr);
+      }
+
+      // step 8 samples right; NOTE(review): coordinates step by MI_SIZE
+      p += 8;
+    }
+  }
+}
+
+static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                        const MACROBLOCKD_PLANE *const pPlane,
+                                        const MODE_INFO **ppModeInfo,
+                                        const ptrdiff_t modeStride,
+                                        const uint32_t cuX,
+                                        const uint32_t cuY) {
+  const uint32_t scaleHorz = pPlane->subsampling_x;
+  const uint32_t scaleVert = pPlane->subsampling_y;
+  const uint32_t width = pPlane->dst.width;
+  const uint32_t height = pPlane->dst.height;
+  uint8_t *const pDst = pPlane->dst.buf;  // NOTE(review): 8-bit only?
+  const int dstStride = pPlane->dst.stride;
+
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+    uint8_t *p = pDst + y * MI_SIZE * dstStride;
+
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+      const MODE_INFO **const pCurr =
+          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      set_lpf_parameters(&params, pCurr, (modeStride << scaleVert), cm,
+                         HORZ_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+                         height, scaleHorz, scaleVert);
+
+      switch (params.filterLength) {
+        // outer edge, filter length 4
+        case 4:
+          aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+
+        // outer edge, filter length 8
+        case 8:
+          aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+
+        // filter length 0: the outer edge is left as is
+        default: break;
+      }
+
+      // internal edge sits 4 rows below the outer edge
+      if (params.filterLengthInternal) {
+        aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
+                             params.lim, params.hev_thr);
+      }
+
+      // step 8 samples right; NOTE(review): coordinates step by MI_SIZE
+      p += 8;
+    }
+  }
+}
+
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only) {
@@ -1970,6 +2309,7 @@
 #else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
+#if !CONFIG_PARALLEL_DEBLOCKING
   enum lf_path path;
   LOOP_FILTER_MASK lfm;
 
@@ -1981,58 +2321,38 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
+#endif
+
 #if CONFIG_PARALLEL_DEBLOCKING
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      int plane;
-
       av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
 
-      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col);
-            break;
-        }
+        av1_filter_block_plane_vert(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
       }
     }
   }
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      int plane;
-
       av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
 
-      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col);
-            break;
-        }
+        av1_filter_block_plane_horz(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
       }
     }
   }