Optimize get_nz_mag

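Unroll the sig_ref_diff_offset neighbor loop into one branch per
tx_class, and replace AOMMIN(level, 3) with a lookup into a new
256-entry clipping table (clip_max3).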

Change-Id: If7d5b82f82768dcbaf859eb719d050509ae9e84f
diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h
index 336e1cc..b466cf3 100644
--- a/av1/common/txb_common.h
+++ b/av1/common/txb_common.h
@@ -436,29 +436,41 @@
   { 0, 2 }, { 0, 3 }, { 0, 4 }
 };
 
+static const uint8_t clip_max3[256] = {
+  0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
 static INLINE int get_nz_mag(const uint8_t *const levels, const int bwl,
                              const TX_CLASS tx_class) {
   int mag;
 
   // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
-  mag = AOMMIN(levels[1], 3);                         // { 0, 1 }
-  mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3);  // { 1, 0 }
+  mag = clip_max3[levels[1]];                         // { 0, 1 }
+  mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]];  // { 1, 0 }
 
-  for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) {
-    const int row_offset =
-        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0]
-                                   : ((tx_class == TX_CLASS_VERT)
-                                          ? sig_ref_diff_offset_vert[idx][0]
-                                          : sig_ref_diff_offset_horiz[idx][0]));
-    const int col_offset =
-        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1]
-                                   : ((tx_class == TX_CLASS_VERT)
-                                          ? sig_ref_diff_offset_vert[idx][1]
-                                          : sig_ref_diff_offset_horiz[idx][1]));
-    const int nb_pos =
-        (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset;
-    mag += AOMMIN(levels[nb_pos], 3);
+  if (tx_class == TX_CLASS_2D) {
+    mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]];          // { 1, 1 }
+    mag += clip_max3[levels[2]];                                    // { 0, 2 }
+    mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]];  // { 2, 0 }
+  } else if (tx_class == TX_CLASS_VERT) {
+    mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]];  // { 2, 0 }
+    mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]];  // { 3, 0 }
+    mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]];  // { 4, 0 }
+  } else {
+    mag += clip_max3[levels[2]];  // { 0, 2 }
+    mag += clip_max3[levels[3]];  // { 0, 3 }
+    mag += clip_max3[levels[4]];  // { 0, 4 }
   }
+
   return mag;
 }