Add av1_get_nz_map_contexts_sse2()

av1_get_nz_map_contexts_sse2() is 10x - 50x faster than the C code.

av1_cost_coeffs_txb() is about 6% faster.

av1_cost_coeffs() is about 3% faster.

Change-Id: Ib9cbed02a65b9cb0c5deb7a5d99c95d0d8ba32c0
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d566d14..c3c3613 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -486,6 +486,12 @@
 
   # End av1_high encoder functions
 
+  # txb: av1_get_nz_map_contexts is declared only when CONFIG_LV_MAP is "yes"; it has an sse2 specialization.
+  if (aom_config("CONFIG_LV_MAP") eq "yes") {
+    add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_TYPE tx_type, int8_t *const coeff_contexts";
+    specialize qw/av1_get_nz_map_contexts sse2/;
+  }
+
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2/;
   add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";