Hybrid forward transforms 16x16 AVX2 optimization

- Unit tests are added for AVX2 SIMD.
- Encoder speed improvement:
  AV1 baseline and EXT_TX, three 1080p sequences at bitrate:
  800 Kbps, 2 Mbps, 6 Mbps, on i7-6700 CPU, average
  user level time reduction: 3.86%.

Change-Id: Ibbd7837ee3a831c6b1e4e471bf6c8d3fa3a19ff4
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 4bf89f9..ad19c55 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -378,7 +378,7 @@
 specialize qw/av1_fht8x8 sse2/;
 
 add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-specialize qw/av1_fht16x16 sse2/;
+specialize qw/av1_fht16x16 sse2 avx2/;
 
 add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht32x32/;