integrate parallel_deblocking with CB4x4

This change makes the parallel deblocking experiment work with
CB4x4. The inner loop processes every 4x4 block.

Change-Id: I86adb3d7b6d67a91ccc12aab29da9bfb8c522cf1
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index 7420174..7ea1e6b 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -149,10 +149,15 @@
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
@@ -179,10 +184,15 @@
 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
@@ -229,10 +239,15 @@
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -256,8 +271,13 @@
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
@@ -390,10 +410,15 @@
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int step = 4;
+#else
+  int step = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < step * count; ++i) {
     const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
                   p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
                   p1 = s[-2 * p], p0 = s[-p];
@@ -436,7 +461,11 @@
 
 void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+#else
   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+#endif
 }
 
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@@ -478,7 +507,11 @@
 
 void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
+#else
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+#endif
 }
 
 void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
@@ -596,10 +629,15 @@
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
@@ -636,10 +674,15 @@
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
@@ -689,10 +732,15 @@
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
     const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -718,8 +766,13 @@
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
 
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask =
@@ -813,10 +866,15 @@
                                             const uint8_t *thresh, int count,
                                             int bd) {
   int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int step = 4;
+#else
+  int step = 8;
+#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < step * count; ++i) {
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
     const uint16_t p1 = s[-2 * p];
@@ -852,7 +910,11 @@
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+#else
   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+#endif
 }
 
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@@ -888,13 +950,21 @@
 void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
+#else
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+#endif
 }
 
 void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+#else
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+#endif
 }
 #endif  // CONFIG_HIGHBITDEPTH
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index af919a8..4b27ae9 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -22,7 +22,7 @@
 
 #include "av1/common/seg_common.h"
 
-#define CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY 0
+#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1
 
 // 64 bit masks for left transform size. Each 1 represents a position where
 // we should apply a loop filter across the left border of an 8x8 block
@@ -1857,8 +1857,6 @@
   dst->buf = dst0;
 }
 
-#if !(CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
-      CONFIG_CB4X4)
 #if CONFIG_PARALLEL_DEBLOCKING
 typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
 static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {
@@ -2010,10 +2008,17 @@
   // not sure if changes are required.
   assert(0 && "Not yet updated");
 #endif  // CONFIG_EXT_PARTITION
+
   {
     const TX_SIZE ts =
         av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert);
+#if CONFIG_EXT_DELTA_Q
+    const uint32_t currLevel =
+        get_filter_level(cm, &cm->lf_info, &ppCurr[0]->mbmi);
+#else
     const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi);
+#endif  // CONFIG_EXT_DELTA_Q
+
     const int currSkipped =
         ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi);
     const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y);
@@ -2034,7 +2039,13 @@
           const MODE_INFO *const pPrev = *(ppCurr - modeStep);
           const TX_SIZE pvTs =
               av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
+#if CONFIG_EXT_DELTA_Q
+          const uint32_t pvLvl =
+              get_filter_level(cm, &cm->lf_info, &pPrev->mbmi);
+#else
           const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
+#endif  // CONFIG_EXT_DELTA_Q
+
           const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
           const int32_t puEdge =
               (coord &
@@ -2046,7 +2057,7 @@
           // if the current and the previous blocks are skipped,
           // deblock the edge if the edge belongs to a PU's edge only.
           if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) {
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
             const TX_SIZE minTs = AOMMIN(ts, pvTs);
             if (TX_4X4 >= minTs) {
               pParams->filterLength = 4;
@@ -2054,7 +2065,7 @@
               pParams->filterLength = 8;
             } else {
               pParams->filterLength = 16;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if PARALLEL_DEBLOCKING_15TAPLUMAONLY
               // No wide filtering for chroma plane
               if (scaleHorz || scaleVert) {
                 pParams->filterLength = 8;
@@ -2064,7 +2075,7 @@
 #else
             pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8);
 
-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
 
             // update the level if the current block is skipped,
             // but the previous one is not
@@ -2072,10 +2083,14 @@
           }
         }
       }
+
+#if !CONFIG_CB4X4
       // prepare internal edge parameters
       if (currLevel && !currSkipped) {
         pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0);
       }
+#endif
+
       // prepare common parameters
       if (pParams->filterLength || pParams->filterLengthInternal) {
         const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
@@ -2093,15 +2108,21 @@
                                         const ptrdiff_t modeStride,
                                         const uint32_t cuX,
                                         const uint32_t cuY) {
+  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scaleHorz = pPlane->subsampling_x;
   const uint32_t scaleVert = pPlane->subsampling_y;
   const uint32_t width = pPlane->dst.width;
   const uint32_t height = pPlane->dst.height;
   uint8_t *const pDst = pPlane->dst.buf;
   const int dstStride = pPlane->dst.stride;
-  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) {
     uint8_t *p = pDst + y * MI_SIZE * dstStride;
-    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) {
+      // inner loop always filters vertical edges in a MI block. If MI size
+      // is 8x8, it will filter the vertical edge aligned with an 8x8 block.
+      // If a 4x4 transform is used, it will then filter the internal edge
+      // aligned with a 4x4 block
       const MODE_INFO **const pCurr =
           ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
       AV1_DEBLOCKING_PARAMETERS params;
@@ -2112,31 +2133,59 @@
       switch (params.filterLength) {
         // apply 4-tap filtering
         case 4:
-          aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
-                             params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_4_c(p, dstStride, params.mblim, params.lim,
+                                 params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
-          aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
-                             params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_8_c(p, dstStride, params.mblim, params.lim,
+                                 params.hev_thr);
           break;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
         // apply 16-tap filtering
         case 16:
-          aom_lpf_vertical_16(p, dstStride, params.mblim, params.lim,
-                              params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_16_c(p, dstStride, params.mblim, params.lim,
+                                  params.hev_thr);
           break;
-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
         // no filtering
         default: break;
       }
       // process the internal edge
       if (params.filterLengthInternal) {
-        aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
-                           params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p + 4), dstStride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      cm->bit_depth);
+        else
+#endif  // CONFIG_HIGHBITDEPTH
+          aom_lpf_vertical_4_c(p + 4, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
       }
       // advance the destination pointer
-      p += 8;
+      p += MI_SIZE;
     }
   }
 }
@@ -2147,15 +2196,21 @@
                                         const ptrdiff_t modeStride,
                                         const uint32_t cuX,
                                         const uint32_t cuY) {
+  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scaleHorz = pPlane->subsampling_x;
   const uint32_t scaleVert = pPlane->subsampling_y;
   const uint32_t width = pPlane->dst.width;
   const uint32_t height = pPlane->dst.height;
   uint8_t *const pDst = pPlane->dst.buf;
   const int dstStride = pPlane->dst.stride;
-  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) {
     uint8_t *p = pDst + y * MI_SIZE * dstStride;
-    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) {
+      // inner loop always filters horizontal edges in a MI block. If MI size
+      // is 8x8, it will first filter the horizontal edge aligned with an 8x8
+      // block. If a 4x4 transform is used, it will then filter the internal
+      // edge aligned with a 4x4 block
       const MODE_INFO **const pCurr =
           ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
       AV1_DEBLOCKING_PARAMETERS params;
@@ -2166,45 +2221,74 @@
       switch (params.filterLength) {
         // apply 4-tap filtering
         case 4:
-          aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
-                               params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                          params.mblim, params.lim,
+                                          params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_4_c(p, dstStride, params.mblim, params.lim,
+                                   params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
-          aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
-                               params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_8_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                          params.mblim, params.lim,
+                                          params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_8_c(p, dstStride, params.mblim, params.lim,
+                                   params.hev_thr);
           break;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
         // apply 16-tap filtering
         case 16:
-          aom_lpf_horizontal_edge_16(p, dstStride, params.mblim, params.lim,
-                                     params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_edge_16_c(
+                CONVERT_TO_SHORTPTR(p), dstStride, params.mblim, params.lim,
+                params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_edge_16_c(p, dstStride, params.mblim, params.lim,
+                                         params.hev_thr);
           break;
-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
         // no filtering
         default: break;
       }
       // process the internal edge
       if (params.filterLengthInternal) {
-        aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
-                             params.lim, params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p + 4 * dstStride),
+                                        dstStride, params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+        else
+#endif  // CONFIG_HIGHBITDEPTH
+          aom_lpf_horizontal_4_c(p + 4 * dstStride, dstStride, params.mblim,
+                                 params.lim, params.hev_thr);
       }
       // advance the destination pointer
-      p += 8;
+      p += MI_SIZE;
     }
   }
 }
 #endif  // CONFIG_PARALLEL_DEBLOCKING
-#endif
 
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only) {
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
-    CONFIG_CB4X4
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
 
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
+    CONFIG_CB4X4
+
+#if !CONFIG_PARALLEL_DEBLOCKING
 #if CONFIG_VAR_TX
   for (int i = 0; i < MAX_MB_PLANE; ++i)
     memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2);
@@ -2229,27 +2313,17 @@
       }
     }
   }
-#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  int mi_row, mi_col;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
+#else
 
-  if (y_only)
-    path = LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
-  else
-    path = LF_PATH_SLOW;
-#endif
-#if CONFIG_PARALLEL_DEBLOCKING
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  assert(0 && "Not yet updated. ToDo as next steps");
+#endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all vertical edges in every 64x64 super block
       for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
         const int32_t scaleHorz = planes[planeIdx].subsampling_x;
         const int32_t scaleVert = planes[planeIdx].subsampling_y;
@@ -2264,6 +2338,42 @@
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all horizontal edges in every 64x64 super block
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
+        av1_filter_block_plane_horz(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
+      }
+    }
+  }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
+#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_PARALLEL_DEBLOCKING
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all vertical edges in every 64x64 super block
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
+        av1_filter_block_plane_vert(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
+      }
+    }
+  }
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all horizontal edges in every 64x64 super block
       for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
         const int32_t scaleHorz = planes[planeIdx].subsampling_x;
         const int32_t scaleVert = planes[planeIdx].subsampling_y;
@@ -2275,6 +2385,18 @@
     }
   }
 #else   // CONFIG_PARALLEL_DEBLOCKING
+  enum lf_path path;
+  LOOP_FILTER_MASK lfm;
+
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
diff --git a/configure b/configure
index 969f8f3..7e9c621 100755
--- a/configure
+++ b/configure
@@ -551,6 +551,11 @@
       soft_enable accounting
       soft_enable inspection
     fi
+    if enabled parallel_deblocking_15tap && ! enabled parallel_deblocking; then
+      log_echo "parallel_deblocking_15tap dependes on parallel_deblocking, so"
+      log_echo "enabling parallel_deblocking"
+      soft_enable parallel_deblocking
+    fi
 }
 
 process_targets() {