Speed up weight calculation during highbd temporal filtering

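In the parent version, the weights used during highbd temporal
filtering were always calculated using the exp() function. In this
CL, the highbd SSE2 and AVX2 paths use approx_exp() instead when
weight_calc_level_in_tf is nonzero, and that speed feature is now
enabled for highbd encoding at speed >= 5.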

                 Instruction Count        BD-Rate Loss(%)
cpu    Testset     Reduction(%)      avg.psnr  ovr.psnr  ssim
 5     LOWRES2       2.273           0.0175    0.0476    0.0476
 5     MIDRES2       2.802           0.0673    0.0309    0.0309
 5      HDRES2       3.333           0.0116   -0.0029   -0.0029
 6     LOWRES2       1.218           0.0246    0.0398    0.0398
 6     MIDRES2       2.438           0.0379    0.0366    0.0366
 6      HDRES2       2.853           0.0406    0.0114    0.0114

STATS_CHANGED for highbd,speed=5,6

Change-Id: I3b68a8687697fbd6af634a21cf8a45be6cf94973
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ac44a5c..0a6db02 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,7 +405,7 @@
     add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+      add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
       specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
     }
   }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 95a6060..438fd0f 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1183,8 +1183,7 @@
   }
 
   if (speed >= 5) {
-    // TODO(Ranjit): Enable the optimization for highbd encoding mode
-    sf->hl_sf.weight_calc_level_in_tf = use_hbd ? 0 : 1;
+    sf->hl_sf.weight_calc_level_in_tf = 1;
 
     sf->fp_sf.reduce_mv_step_param = 4;
 
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 7ec4781..d58f326 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -725,11 +725,12 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const MV *subblock_mvs,
     const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
   av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
                               num_planes, noise_levels, subblock_mvs,
-                              subblock_mses, q_factor, filter_strength, 0, pred,
-                              accum, count);
+                              subblock_mses, q_factor, filter_strength,
+                              tf_wgt_calc_lvl, pred, accum, count);
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 /*!\brief Normalizes the accumulated filtering result to produce the filtered
@@ -818,6 +819,7 @@
   const int mi_h = mi_size_high_log2[block_size];
   const int mi_w = mi_size_wide_log2[block_size];
   const int num_planes = av1_num_planes(&cpi->common);
+  const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
   uint32_t *accum = tf_data->accum;
   uint16_t *count = tf_data->count;
   uint8_t *pred = tf_data->pred;
@@ -874,13 +876,13 @@
             av1_highbd_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,
-                filter_strength, pred, accum, count);
+                filter_strength, weight_calc_level_in_tf, pred, accum, count);
           } else {
 #endif  // CONFIG_AV1_HIGHBITDEPTH
             av1_apply_temporal_filter_c(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,
-                filter_strength, 0, pred, accum, count);
+                filter_strength, weight_calc_level_in_tf, pred, accum, count);
 #if CONFIG_AV1_HIGHBITDEPTH
           }
 #endif            // CONFIG_AV1_HIGHBITDEPTH
@@ -889,14 +891,12 @@
             av1_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,
-                filter_strength, cpi->sf.hl_sf.weight_calc_level_in_tf, pred,
-                accum, count);
+                filter_strength, weight_calc_level_in_tf, pred, accum, count);
           } else {
             av1_apply_temporal_filter_c(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,
-                filter_strength, cpi->sf.hl_sf.weight_calc_level_in_tf, pred,
-                accum, count);
+                filter_strength, weight_calc_level_in_tf, pred, accum, count);
           }
         }
       }
diff --git a/av1/encoder/x86/highbd_temporal_filter_avx2.c b/av1/encoder/x86/highbd_temporal_filter_avx2.c
index 68509fa..42848d7 100644
--- a/av1/encoder/x86/highbd_temporal_filter_avx2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/temporal_filter.h"
 
@@ -147,7 +148,8 @@
     const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
     uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
     const double inv_num_ref_pixels, const double decay_factor,
-    const double inv_factor, const double weight_factor, double *d_factor) {
+    const double inv_factor, const double weight_factor, double *d_factor,
+    int tf_wgt_calc_lvl) {
   assert(((block_width == 16) || (block_width == 32)) &&
          ((block_height == 16) || (block_height == 32)));
 
@@ -304,28 +306,57 @@
     acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
   }
 
-  for (int i = 0, k = 0; i < block_height; i++) {
-    for (int j = 0; j < block_width; j++, k++) {
-      const int pixel_value = frame2[i * stride2 + j];
-      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+  if (tf_wgt_calc_lvl == 0) {
+    for (int i = 0, k = 0; i < block_height; i++) {
+      for (int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame2[i * stride2 + j];
+        uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      // Scale down the difference for high bit depth input.
-      diff_sse >>= ((bd - 8) * 2);
+        // Scale down the difference for high bit depth input.
+        diff_sse >>= ((bd - 8) * 2);
 
-      const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
-      const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
-      scaled_error = AOMMIN(scaled_error, 7);
-      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+        accumulator[k] += weight * pixel_value;
+      }
+    }
+  } else {
+    for (int i = 0, k = 0; i < block_height; i++) {
+      for (int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame2[i * stride2 + j];
+        uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+        // Scale down the difference for high bit depth input.
+        diff_sse >>= ((bd - 8) * 2);
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const float fweight =
+            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+        const int weight = iroundpf(fweight);
+
+        count[k] += weight;
+        accumulator[k] += weight * pixel_value;
+      }
     }
   }
 }
@@ -335,7 +366,8 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const MV *subblock_mvs,
     const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
   assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
@@ -424,7 +456,7 @@
         ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
         subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
         luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
-        weight_factor, d_factor);
+        weight_factor, d_factor, tf_wgt_calc_lvl);
     plane_offset += plane_h * plane_w;
   }
 }
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
index 1bfdaf7..f48b27b 100644
--- a/av1/encoder/x86/highbd_temporal_filter_sse2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -13,6 +13,7 @@
 #include <emmintrin.h>
 
 #include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/temporal_filter.h"
 
@@ -95,7 +96,8 @@
     const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
     uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
     const double inv_num_ref_pixels, const double decay_factor,
-    const double inv_factor, const double weight_factor, double *d_factor) {
+    const double inv_factor, const double weight_factor, double *d_factor,
+    int tf_wgt_calc_lvl) {
   assert(((block_width == 16) || (block_width == 32)) &&
          ((block_height == 16) || (block_height == 32)));
 
@@ -179,28 +181,57 @@
     }
   }
 
-  for (int i = 0, k = 0; i < block_height; i++) {
-    for (int j = 0; j < block_width; j++, k++) {
-      const int pixel_value = frame2[i * stride2 + j];
-      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+  if (tf_wgt_calc_lvl == 0) {
+    for (int i = 0, k = 0; i < block_height; i++) {
+      for (int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame2[i * stride2 + j];
+        uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      // Scale down the difference for high bit depth input.
-      diff_sse >>= ((bd - 8) * 2);
+        // Scale down the difference for high bit depth input.
+        diff_sse >>= ((bd - 8) * 2);
 
-      const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
-      const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
-      scaled_error = AOMMIN(scaled_error, 7);
-      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+        accumulator[k] += weight * pixel_value;
+      }
+    }
+  } else {
+    for (int i = 0, k = 0; i < block_height; i++) {
+      for (int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame2[i * stride2 + j];
+        uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+        // Scale down the difference for high bit depth input.
+        diff_sse >>= ((bd - 8) * 2);
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const float fweight =
+            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+        const int weight = iroundpf(fweight);
+
+        count[k] += weight;
+        accumulator[k] += weight * pixel_value;
+      }
     }
   }
 }
@@ -210,7 +241,8 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const MV *subblock_mvs,
     const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
   assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
@@ -299,7 +331,7 @@
         ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
         subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
         luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
-        weight_factor, d_factor);
+        weight_factor, d_factor, tf_wgt_calc_lvl);
     plane_offset += plane_h * plane_w;
   }
 }
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 79d7e52..be92063 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -316,7 +316,7 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_level, const MV *subblock_mvs,
     const int *subblock_mses, const int q_factor, const int filter_strenght,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
 typedef libaom_test::FuncParam<HBDTemporalFilterFunc>
     HBDTemporalFilterFuncParam;
 
@@ -328,6 +328,7 @@
   virtual ~HBDTemporalFilterTest() {}
   virtual void SetUp() {
     params_ = GET_PARAM(0);
+    tf_wgt_calc_lvl_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src1_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
@@ -389,6 +390,7 @@
 
  protected:
   HBDTemporalFilterFuncParam params_;
+  int tf_wgt_calc_lvl_;
   uint16_t *src1_;
   uint16_t *src2_;
   ACMRandom rnd_;
@@ -477,20 +479,20 @@
 
     params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
                      num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
-                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
-                     accumulator_ref, count_ref);
+                     filter_strength, tf_wgt_calc_lvl_,
+                     CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
     params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
                      num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
-                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
-                     accumulator_mod, count_mod);
+                     filter_strength, tf_wgt_calc_lvl_,
+                     CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < run_times; j++) {
         params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
                          num_planes, sigma, subblock_mvs, subblock_mses,
-                         q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
-                         accumulator_ref, count_ref);
+                         q_factor, filter_strength, tf_wgt_calc_lvl_,
+                         CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -500,8 +502,8 @@
       for (int j = 0; j < run_times; j++) {
         params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
                          num_planes, sigma, subblock_mvs, subblock_mses,
-                         q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
-                         accumulator_mod, count_mod);
+                         q_factor, filter_strength, tf_wgt_calc_lvl_,
+                         CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =
@@ -558,7 +560,7 @@
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
                          Combine(ValuesIn(HBDtemporal_filter_test_sse2),
-                                 Range(64, 65, 4)));
+                                 Values(0, 1)));
 #endif  // HAVE_SSE2
 #if HAVE_AVX2
 HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
@@ -567,7 +569,7 @@
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
                          Combine(ValuesIn(HBDtemporal_filter_test_avx2),
-                                 Range(64, 65, 4)));
+                                 Values(0, 1)));
 #endif  // HAVE_AVX2
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace