Speed up  tm_predictor_4x4

tm_predictor_4x4 is implemented with SSE2 using XMM registers.
Speed up by ~25% in ./test_intra_pred_speed.

Change-Id: I25074b78d476a2cb17f81cf654bdfd80df2070e0
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5d59e83..d44a64a 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,9 +191,14 @@
 INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
                 vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
                 vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE && CONFIG_USE_X86INC
 
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,