Partial inv txfm based on eob for ssse3

Add an implementation of the partial inv txfm based on eob
for the ssse3 versions of the functions.

For the decoder, profiling results show that the CPU time
of inv txfm drops from 3.01% to 2.14%.
Commit ID: d0d1422
Sequence: 150 frames of night_720p30
Config: Default (AVX2 disabled; compares ssse3 with
and without eob)

Change-Id: I941feef7d5afcbf2cb214df57a0ff167cc4d964e
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index c07ce09..bb5dd49 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -18,6 +18,7 @@
 
 #include "aom_ports/aom_timer.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
 #include "test/acm_random.h"
 #include "test/av1_txfm_test.h"
 #include "test/util.h"
@@ -252,20 +253,31 @@
   int stride = BLK_WIDTH;
   int rows = tx_size_high[tx_size];
   int cols = tx_size_wide[tx_size];
-  run_times = AOMMAX(run_times / (rows * cols), 1);
+  const int rows_nonezero = AOMMIN(32, rows);
+  const int cols_nonezero = AOMMIN(32, cols);
+  run_times /= (rows * cols);
+  run_times = AOMMAX(1, run_times);
+  const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int16_t eobmax = rows_nonezero * cols_nonezero;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int randTimes = run_times == 1 ? 500 : 1;
+  int randTimes = run_times == 1 ? (eobmax + 500) : 1;
   for (int cnt = 0; cnt < randTimes; ++cnt) {
-    const int16_t max_in = (1 << (bd + 1)) - 1;
-    for (int r = 0; r < rows; ++r) {
-      for (int c = 0; c < cols; ++c) {
+    const int16_t max_in = (1 << (bd)) - 1;
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
         input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
         output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
         ref_output[r * stride + c] = output[r * stride + c];
       }
     }
     fwd_func_(input, inv_input, stride, tx_type, bd);
-    int eob = AOMMIN(32, rows) * AOMMIN(32, cols);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
 
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
@@ -273,13 +285,13 @@
       ref_func_(inv_input, ref_output, stride, tx_type, bd);
     }
     aom_usec_timer_mark(&timer);
-    double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
     aom_usec_timer_start(&timer);
     for (int i = 0; i < run_times; ++i) {
       target_func_(inv_input, output, stride, tx_type, tx_size, eob);
     }
     aom_usec_timer_mark(&timer);
-    double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
     if (run_times > 10) {
       printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
              time2);
@@ -289,8 +301,9 @@
       for (int c = 0; c < cols; ++c) {
         uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
         ASSERT_EQ(ref_value, output[r * stride + c])
-            << "[" << r << "," << c << "] " << cnt << " tx_size: " << tx_size
-            << " tx_type: " << tx_type;
+            << "[" << r << "," << c << "] " << cnt
+            << " tx_size: " << static_cast<int>(tx_size)
+            << " tx_type: " << tx_type << " eob " << eob;
       }
     }
   }