Fix the overflow of av1_fht32x32() in 2D DCT_DCT

- Use range check function to avoid DCT_DCT overflow.
  We need to re-develop the column txfm side scaling/rounding. Now,
  we prefer to maintain the current BDRate level.
- Encoder user level time reduction <1% owing to av1_fht32x32_avx2.
- Add MemCheck unit test and fdct32() unit test.

Change-Id: I1e67030f67bc637859798ebe2f6698afffb8531c
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index a949ebf..3d07b44 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -69,6 +69,7 @@
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
     pitch_ = 32;
+    height_ = 32;
     fwd_txfm_ref = fht32x32_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
@@ -90,6 +91,7 @@
 };
 
 TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
 
 #if CONFIG_AOM_HIGHBITDEPTH
 class AV1HighbdTrans32x32HT
@@ -164,8 +166,7 @@
 
 #if HAVE_AVX2
 const Ht32x32Param kArrayHt32x32Param_avx2[] = {
-  // TODO(luoyi): DCT_DCT tx_type is not enabled in av1_fht32x32_c(avx2) yet.
-  // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),