Bitmask patch 0: Store info for bitmask at decoder

Purpose: build a bitmask for loop filtering to speed up the decoder.
Result: makes the decoder 6% faster on a single thread.

The flag LOOP_FILTER_BITMASK controls whether the new feature is used.
It is currently turned off; I will turn it on after all of the following
CLs are checked in. Please expect a few follow-up CLs.

This CL:
Store, at the decoder, the information that determines whether the filter
is applied. Then build the bitmask for the whole frame after all tiles
are decoded.

Change-Id: I3902ed825b674cff8131d0e9cbc14df3f9a566f6
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index df6d4f8..0b6020a 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -68,23 +68,6 @@
 //    10101010|10101010
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-// TODO(chengchen): make these tables static
-const FilterMask left_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4,
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
-  { { 0x5555555555555555ULL,  // TX_8X8,
-      0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
-
-  { { 0x1111111111111111ULL,  // TX_16X16,
-      0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
-
-  { { 0x0101010101010101ULL,  // TX_32X32,
-      0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
-
-  { { 0x0001000100010001ULL,  // TX_64X64,
-      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
 
 // 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
 // We use 4 uint64_t to represent the 256 bit.
@@ -113,92 +96,6 @@
 //    00000000|00000000
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-const FilterMask above_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
-  { { 0x0000ffff0000ffffULL,  // TX_8X8
-      0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_16X16
-      0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_32X32
-      0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000ffffULL,  // TX_64X64
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-};
-
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 4x4 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
-  { { 0x0000000000000001ULL,  // BLOCK_4X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000010001ULL,  // BLOCK_4X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000000003ULL,  // BLOCK_8X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000030003ULL,  // BLOCK_8X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00000000000f000fULL,  // BLOCK_16X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X32
-      0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X32
-      0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X64
-      0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X32
-      0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X64
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-  // Y plane max coding block size is 128x128, but the codec divides it
-  // into 4 64x64 blocks.
-  // BLOCK_64X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X64
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-
-  { { 0x0001000100010001ULL,  // BLOCK_4X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000000fULL,  // BLOCK_16X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X32
-      0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000ff00ffULL,  // BLOCK_32X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X64
-      0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
-};
 
 LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
                                      int mi_col) {
@@ -248,10 +145,10 @@
            SIMD_WIDTH);
   }
 }
-static uint8_t get_filter_level(const AV1_COMMON *cm,
-                                const loop_filter_info_n *lfi_n,
-                                const int dir_idx, int plane,
-                                const MB_MODE_INFO *mbmi) {
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+                         const int dir_idx, int plane,
+                         const MB_MODE_INFO *mbmi) {
   const int segment_id = mbmi->segment_id;
   if (cm->delta_lf_present_flag) {
     int delta_lf;
@@ -389,7 +286,7 @@
 // After locating which uint64_t, mi_row % 4 is the
 // row offset, and each row has 16 = 1 << stride_log2 4x4 units.
 // Therefore, shift = (row << stride_log2) + mi_col;
-static int get_index_shift(int mi_col, int mi_row, int *index) {
+int get_index_shift(int mi_col, int mi_row, int *index) {
   // *index = mi_row >> 2;
   // rows = mi_row % 4;
   // stride_log2 = 4;
@@ -599,11 +496,12 @@
           const TX_SIZE prev_tx_size =
               plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
                     : mbmi_prev->tx_size;
-          const TX_SIZE min_tx_size =
-              (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
-                                          txsize_horz_map[prev_tx_size])
-                                 : AOMMIN(txsize_vert_map[tx_size],
-                                          txsize_vert_map[prev_tx_size]);
+          TX_SIZE min_tx_size = (dir == VERT_EDGE)
+                                    ? AOMMIN(txsize_horz_map[tx_size],
+                                             txsize_horz_map[prev_tx_size])
+                                    : AOMMIN(txsize_vert_map[tx_size],
+                                             txsize_vert_map[prev_tx_size]);
+          min_tx_size = AOMMIN(min_tx_size, TX_16X16);
           assert(min_tx_size < TX_SIZES);
           const int row = r % MI_SIZE_64X64;
           const int col = c % MI_SIZE_64X64;
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 1229c0d..7f4ade7 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -67,6 +67,14 @@
   // V plane vertical edge and horizontal edge filter level
   uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
   uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+
+  // Per-4x4 info stored while decoding (skip flag, block borders, tx sizes)
+  FilterMask skip;
+  FilterMask is_vert_border;
+  FilterMask is_horz_border;
+  // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+  FilterMask tx_size_ver[2][5];
+  FilterMask tx_size_hor[2][5];
 } LoopFilterMask;
 #endif  // LOOP_FILTER_BITMASK
 
@@ -119,9 +127,15 @@
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
+#if LOOP_FILTER_BITMASK
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *mbd, int plane_start,
                            int plane_end, int partial_frame);
+#else
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+                           struct macroblockd *mbd, int plane_start,
+                           int plane_end, int partial_frame);
+#endif
 
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
@@ -142,6 +156,9 @@
   MACROBLOCKD *xd;
 } LFWorkerData;
 
+uint8_t get_filter_level(const struct AV1Common *cm,
+                         const loop_filter_info_n *lfi_n, const int dir_idx,
+                         int plane, const MB_MODE_INFO *mbmi);
 #if LOOP_FILTER_BITMASK
 void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
                        int plane, int subsampling_x, int subsampling_y,
@@ -154,6 +171,43 @@
 void av1_filter_block_plane_hor(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane, int pl,
                                 int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+                                     int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {
+  { { 0x0000000000000001ULL,  // TX_4X4: bit 0 only
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0000000000010001ULL,  // TX_8X8: bits 0 and 16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_16X16: every 16th bit, 1 word
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_32X32: every 16th bit, 2 words
+      0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_64X64: every 16th bit, 4 words
+      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+  {
+      0x0000000000000001ULL,  // TX_4X4: 1 bit
+      0x0000000000000003ULL,  // TX_8X8: 2 contiguous bits
+      0x000000000000000fULL,  // TX_16X16: 4 contiguous bits
+      0x00000000000000ffULL,  // TX_32X32: 8 contiguous bits
+      0x000000000000ffffULL,  // TX_64X64: 16 contiguous bits
+  },
+  {
+      0x0000000000000001ULL,  // TX_4X4: 1 bit
+      0x0000000000000005ULL,  // TX_8X8: 2 bits, every other position
+      0x0000000000000055ULL,  // TX_16X16: 4 bits, every other position
+      0x0000000000005555ULL,  // TX_32X32: 8 bits, every other position
+      0x0000000055555555ULL,  // TX_64X64: 16 bits, every other position
+  },
+};
 #endif
 
 #ifdef __cplusplus
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 03cedef..2cfaf37 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1329,6 +1329,261 @@
   }
 }
 
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_info(AV1_COMMON *cm, int mi_row, int mi_col,
+                               BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, int type) {
+  if (type == 0) {
+    // type 0: per-txb tx sizes. TODO(chengchen): optimize step
+    LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+    // vertical direction: in each row, flag the first 4x4 of every tx block
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize]; ++r) {
+      for (int c = mi_col; c < mi_col + mi_size_wide[bsize];) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        // Y plane
+        const int blk_row = r & (mi_size_high[bsize] - 1);
+        const int blk_col = c & (mi_size_wide[bsize] - 1);
+        const TX_SIZE mb_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+            bsize, blk_row, blk_col)];
+        const TX_SIZE tx_size = txsize_horz_map[mb_tx_size];
+        lfm->tx_size_ver[0][tx_size].bits[index] |= mask;
+        // U/V plane: max uv tx size of the block, same bit position
+        const TX_SIZE tx_size_uv = txsize_horz_map[av1_get_max_uv_txsize(
+            mbmi->sb_type, cm->seq_params.subsampling_x,
+            cm->seq_params.subsampling_y)];
+        lfm->tx_size_ver[1][tx_size_uv].bits[index] |= mask;
+
+        c += tx_size_wide_unit[tx_size];
+      }
+    }
+    // horizontal direction: in each column, flag the top 4x4 of every tx block
+    for (int c = mi_col; c < mi_col + mi_size_wide[bsize]; ++c) {
+      for (int r = mi_row; r < mi_row + mi_size_high[bsize];) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        // Y plane
+        const int blk_row = r & (mi_size_high[bsize] - 1);
+        const int blk_col = c & (mi_size_wide[bsize] - 1);
+        const TX_SIZE mb_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+            bsize, blk_row, blk_col)];
+        const TX_SIZE tx_size = txsize_vert_map[mb_tx_size];
+        lfm->tx_size_hor[0][tx_size].bits[index] |= mask;
+        // U/V plane: max uv tx size of the block, same bit position
+        const TX_SIZE tx_size_uv = txsize_vert_map[av1_get_max_uv_txsize(
+            mbmi->sb_type, cm->seq_params.subsampling_x,
+            cm->seq_params.subsampling_y)];
+        lfm->tx_size_hor[1][tx_size_uv].bits[index] |= mask;
+
+        r += tx_size_high_unit[tx_size];
+      }
+    }
+    // store other info: skip flag and block-border bits at tx block starts
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize]; ++r) {
+      for (int c = mi_col; c < mi_col + mi_size_wide[bsize];) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        if (mbmi->skip && is_inter_block(mbmi)) lfm->skip.bits[index] |= mask;
+        if (r == mi_row) lfm->is_horz_border.bits[index] |= mask;
+        if (c == mi_col) lfm->is_vert_border.bits[index] |= mask;
+        const int blk_row = r & (mi_size_high[bsize] - 1);
+        const int blk_col = c & (mi_size_wide[bsize] - 1);
+        const TX_SIZE mb_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+            bsize, blk_row, blk_col)];
+        c += tx_size_wide_unit[mb_tx_size];
+      }
+    }
+    for (int c = mi_col; c < mi_col + mi_size_wide[bsize]; ++c) {
+      for (int r = mi_row; r < mi_row + mi_size_high[bsize];) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        if (mbmi->skip && is_inter_block(mbmi)) lfm->skip.bits[index] |= mask;
+        if (r == mi_row) lfm->is_horz_border.bits[index] |= mask;
+        if (c == mi_col) lfm->is_vert_border.bits[index] |= mask;
+        const int blk_row = r & (mi_size_high[bsize] - 1);
+        const int blk_col = c & (mi_size_wide[bsize] - 1);
+        const TX_SIZE mb_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+            bsize, blk_row, blk_col)];
+        r += tx_size_high_unit[mb_tx_size];
+      }
+    }
+    const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+    const uint8_t level_vert_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+    const uint8_t level_vert_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+    const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+    const uint8_t level_horz_u = get_filter_level(cm, &cm->lf_info, 1, 1, mbmi);
+    const uint8_t level_horz_v = get_filter_level(cm, &cm->lf_info, 1, 2, mbmi);
+    const int col_start = mi_col % MI_SIZE_64X64;
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+      const int row = r % MI_SIZE_64X64;
+      memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_u_ver[row][col_start], level_vert_u,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_v_ver[row][col_start], level_vert_v,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_u_hor[row][col_start], level_horz_u,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_v_hor[row][col_start], level_horz_v,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+    }
+  } else {
+    // type 1: one tx size per block. TODO(chengchen): optimize step
+    LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+    // vertical direction (tx sizes are looked up once for the whole block)
+    const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+    const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+    const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+        mbmi->sb_type, cm->seq_params.subsampling_x,
+        cm->seq_params.subsampling_y)];
+    const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+        mbmi->sb_type, cm->seq_params.subsampling_x,
+        cm->seq_params.subsampling_y)];
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize];
+         r += tx_size_high_unit[mbmi->tx_size]) {
+      for (int c = mi_col; c < mi_col + mi_size_wide[bsize];
+           c += tx_size_wide_unit[mbmi->tx_size]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        if (tx_size_y_vert <= TX_8X8) {
+          lfm->tx_size_ver[0][tx_size_y_horz].bits[index] |=
+              (left_txform_mask[tx_size_y_vert].bits[0] << shift);
+        } else if (tx_size_y_vert == TX_16X16) {
+          lfm->tx_size_ver[0][tx_size_y_horz].bits[index] |=
+              (left_txform_mask[tx_size_y_vert].bits[0] << col);
+        } else if (tx_size_y_vert == TX_32X32) {
+          for (int i = 0; i < 2; ++i) {
+            lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+                (left_txform_mask[tx_size_y_vert].bits[i] << col);
+          }
+        } else {
+          for (int i = 0; i < 4; ++i) {
+            lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+                (left_txform_mask[tx_size_y_vert].bits[i] << col);
+          }
+        }
+      }
+    }
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize];
+         r += tx_size_high_unit[tx_size_uv_vert]) {
+      for (int c = mi_col; c < mi_col + mi_size_wide[bsize];
+           c += tx_size_wide_unit[tx_size_uv_horz]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        if (tx_size_uv_vert <= TX_8X8) {
+          lfm->tx_size_ver[1][tx_size_uv_horz].bits[index] |=
+              (left_txform_mask[tx_size_uv_vert].bits[0] << shift);
+        } else if (tx_size_uv_vert == TX_16X16) {
+          lfm->tx_size_ver[1][tx_size_uv_horz].bits[index] |=
+              (left_txform_mask[tx_size_uv_vert].bits[0] << col);
+        } else if (tx_size_uv_vert == TX_32X32) {
+          for (int i = 0; i < 2; ++i) {
+            lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+                (left_txform_mask[tx_size_uv_vert].bits[i] << col);
+          }
+        } else {
+          for (int i = 0; i < 4; ++i) {
+            lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+                (left_txform_mask[tx_size_uv_vert].bits[i] << col);
+          }
+        }
+      }
+    }
+    // horizontal direction: OR in the above_txform_mask run at each tx block
+    for (int c = mi_col; c < mi_col + mi_size_wide[bsize];
+         c += tx_size_wide_unit[mbmi->tx_size]) {
+      for (int r = mi_row; r < mi_row + mi_size_high[bsize];
+           r += tx_size_high_unit[mbmi->tx_size]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        lfm->tx_size_hor[0][tx_size_y_vert].bits[index] |=
+            (above_txform_mask[0][tx_size_y_horz] << shift);
+      }
+    }
+    for (int c = mi_col; c < mi_col + mi_size_wide[bsize];
+         c += tx_size_wide_unit[tx_size_uv_horz]) {
+      for (int r = mi_row; r < mi_row + mi_size_high[bsize];
+           r += tx_size_high_unit[tx_size_uv_vert]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        lfm->tx_size_hor[1][tx_size_uv_vert].bits[index] |=
+            (above_txform_mask[0][tx_size_uv_horz] << shift);
+      }
+    }
+    // store other info: skip flag and block-border bits at tx block starts
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize]; ++r) {
+      for (int c = mi_col; c < mi_col + mi_size_wide[bsize];
+           c += tx_size_wide_unit[mbmi->tx_size]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        if (mbmi->skip && is_inter_block(mbmi)) lfm->skip.bits[index] |= mask;
+        if (r == mi_row) lfm->is_horz_border.bits[index] |= mask;
+        if (c == mi_col) lfm->is_vert_border.bits[index] |= mask;
+      }
+    }
+    for (int c = mi_col; c < mi_col + mi_size_wide[bsize]; ++c) {
+      for (int r = mi_row; r < mi_row + mi_size_high[bsize];
+           r += tx_size_high_unit[mbmi->tx_size]) {
+        int index = 0;
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        if (mbmi->skip && is_inter_block(mbmi)) lfm->skip.bits[index] |= mask;
+        if (r == mi_row) lfm->is_horz_border.bits[index] |= mask;
+        if (c == mi_col) lfm->is_vert_border.bits[index] |= mask;
+      }
+    }
+    const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+    const uint8_t level_vert_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+    const uint8_t level_vert_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+    const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+    const uint8_t level_horz_u = get_filter_level(cm, &cm->lf_info, 1, 1, mbmi);
+    const uint8_t level_horz_v = get_filter_level(cm, &cm->lf_info, 1, 2, mbmi);
+    const int col_start = mi_col % MI_SIZE_64X64;
+    for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+      const int row = r % MI_SIZE_64X64;
+      memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_u_ver[row][col_start], level_vert_u,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_v_ver[row][col_start], level_vert_v,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_u_hor[row][col_start], level_horz_u,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+      memset(&lfm->lfl_v_hor[row][col_start], level_horz_v,
+             sizeof(uint8_t) * mi_size_wide[bsize]);
+    }
+  }
+}
+#endif
+
 static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
                                int mi_row, int mi_col, aom_reader *r,
                                PARTITION_TYPE partition, BLOCK_SIZE bsize) {
@@ -1353,12 +1608,18 @@
     for (int idy = 0; idy < height; idy += bh)
       for (int idx = 0; idx < width; idx += bw)
         read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
+#if LOOP_FILTER_BITMASK
+    store_bitmask_info(cm, mi_row, mi_col, bsize, mbmi, 0);
+#endif
   } else {
     mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
     if (inter_block_tx)
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
                   mbmi->skip && is_inter_block(mbmi), xd);
+#if LOOP_FILTER_BITMASK
+    store_bitmask_info(cm, mi_row, mi_col, bsize, mbmi, 1);
+#endif
   }
 
   if (cm->delta_q_present_flag) {
@@ -5199,6 +5460,11 @@
   const int tile_count_tg = end_tile - start_tile + 1;
 
   if (initialize_flag) setup_frame_info(pbi);
+  const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+  av1_loop_filter_frame_init(cm, 0, num_planes);
+  av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
 
   if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
       pbi->row_mt)
@@ -5210,7 +5476,6 @@
   else
     *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 
-  const int num_planes = av1_num_planes(cm);
   // If the bit stream is monochrome, set the U and V buffers to a constant.
   if (num_planes < 3) {
     set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);