Hybrid forward transforms 16x16 AVX2 optimization

- Unit tests are added for AVX2 SIMD.
- Encoder speed improvement:
  AV1 baseline and EXT_TX, three 1080p sequences at bitrate:
  800 Kbps, 2 Mbps, 6 Mbps, on i7-6700 CPU, average
  user level time reduction: 3.86%.

Change-Id: Ibbd7837ee3a831c6b1e4e471bf6c8d3fa3a19ff4
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 948995d..51f4d3c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9049,7 +9049,8 @@
             uint8_t drl1_ctx = 0;
             drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
                                    i + idx_offset);
-            tmp_rate += (tmp_rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] : 0);
+            tmp_rate +=
+                (tmp_rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] : 0);
           }
 
           if (mbmi_ext->ref_mv_count[ref_frame_type] >