Add a precision reduction step after nn_predict

The precision reduction step is a temporary fix to reduce the
probability of mismatches between C and SIMD implementations
for floating point av1_nn_predict(). Ideally the SIMD
implementation needs to be redone.

The patch fixes the issue in bug 2415, but there is no
guarantee that mismatches will never happen, since the
error is often larger than the reduced precision.

STATS_CHANGED in the noise range.
lowres (33 frames end-usage q cpu-used 0): +0.003
midres (33 frames end-usage q cpu-used 0): -0.011

BUG=aomedia:2415

Change-Id: I70298e0e35abfe86cb65ad12b7ee506f9b736e74
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 35f34b6..ee4325c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -329,7 +329,7 @@
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
 
-  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
   specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions
diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c
index b5d8a16..57228ec 100644
--- a/av1/encoder/ml.c
+++ b/av1/encoder/ml.c
@@ -15,11 +15,21 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/ml.h"
 
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+  const int prec_bits = 11;
+  const int prec = 1 << prec_bits;
+  const float inv_prec = (float)(1.0 / prec);
+  for (int i = 0; i < num_output; i++) {
+    output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+  }
+}
+
 // Calculate prediction based on the given input features and neural net config.
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_c(const float *input_nodes,
-                      const NN_CONFIG *const nn_config, float *const output) {
+                      const NN_CONFIG *const nn_config, int reduce_prec,
+                      float *const output) {
   int num_input_nodes = nn_config->num_inputs;
   int buf_index = 0;
   float buf[2][NN_MAX_NODES_PER_LAYER];
@@ -55,6 +65,7 @@
       val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
     output[node] = val;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
 
 #if CONFIG_NN_V2
@@ -107,7 +118,7 @@
 }
 
 void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
-                       float *output) {
+                       int reduce_prec, float *output) {
   const float *input_nodes = feature;
 
   // Propagate the layers.
@@ -124,6 +135,7 @@
   assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
   // Copy the final layer output
   memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
 }
 #endif  // CONFIG_NN_V2
 
diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h
index e3c5eac..62d543d 100644
--- a/av1/encoder/ml.h
+++ b/av1/encoder/ml.h
@@ -63,7 +63,7 @@
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
-                       float *output);
+                       int reduce_prec, float *output);
 #endif  // CONFIG_NN_V2
 
 // Applies the softmax normalization function to the input
@@ -71,6 +71,10 @@
 // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
 void av1_nn_softmax(const float *input, float *output, int n);
 
+// Applies a precision reduction to output of av1_nn_predict to prevent
+// mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index f16532a..f14830b 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -208,7 +208,7 @@
   }
 
   // Make decision
-  av1_nn_predict(dnn_features, dnn_config, logits);
+  av1_nn_predict(dnn_features, dnn_config, 1, logits);
   aom_clear_system_state();
 
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
@@ -287,7 +287,7 @@
 
   float score = 0.0f;
 
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
 
   if (score > split_only_thresh) {
@@ -568,7 +568,7 @@
                               ? PARTITION_TYPES
                               : EXT_PARTITION_TYPES;
 
-  av1_nn_predict(features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   aom_clear_system_state();
 
   av1_nn_softmax(scores, probs, num_classes);
@@ -758,7 +758,7 @@
   assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE);
 
   aom_clear_system_state();
-  av1_nn_predict(features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
 
   int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
@@ -929,7 +929,7 @@
   assert(f_idx == FEATURES);
 
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   // Score is indicator of confidence that we should NOT terminate.
   if (score < thresh) *terminate_partition_search = 1;
 }
@@ -1017,7 +1017,7 @@
 
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
   float raw_scores[3] = { 0.0f };
-  av1_nn_predict(features, nn_config, raw_scores);
+  av1_nn_predict(features, nn_config, 1, raw_scores);
   aom_clear_system_state();
   float probs[3] = { 0.0f };
   av1_nn_softmax(raw_scores, probs, 3);
@@ -1085,7 +1085,7 @@
 
   // Calculate scores using the NN model.
   float score[16] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
+  av1_nn_predict(features, nn_config, 1, score);
   aom_clear_system_state();
   int int_score[16];
   int max_score = -1000;
@@ -1225,7 +1225,7 @@
 
   // Calculate scores using the NN model.
   float score[LABELS] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
+  av1_nn_predict(features, nn_config, 1, score);
   aom_clear_system_state();
   int int_score[LABELS];
   int max_score = -1000;
@@ -1309,7 +1309,7 @@
 
   // Calculate score using the NN model.
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
 
   // Make decision.
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 9d0f6e6..ec90f6f 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -1178,7 +1178,7 @@
     /* clang-format on */
     // Infer using ML model.
     float score;
-    av1_nn_predict(features, &av1_use_flat_gop_nn_config, &score);
+    av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
     use_alt_ref = (score <= 0.0);
   }
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 35ba214..0c8d0aa 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1656,11 +1656,11 @@
                                   &vfeatures[vfeatures_num - 1]);
   aom_clear_system_state();
 #if CONFIG_NN_V2
-  av1_nn_predict_v2(hfeatures, nn_config_hor, hscores);
-  av1_nn_predict_v2(vfeatures, nn_config_ver, vscores);
+  av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+  av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
 #else
-  av1_nn_predict(hfeatures, nn_config_hor, hscores);
-  av1_nn_predict(vfeatures, nn_config_ver, vscores);
+  av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+  av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
 #endif
   aom_clear_system_state();
 
@@ -2660,8 +2660,8 @@
   features[7] = (float)vert_corr;
 
   float rate_f, dist_by_sse_norm_f;
-  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
-  av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+  av1_nn_predict(features, &av1_pustats_dist_nnconfig, 1, &dist_by_sse_norm_f);
+  av1_nn_predict(features, &av1_pustats_rate_nnconfig, 1, &rate_f);
   aom_clear_system_state();
   const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -5019,7 +5019,7 @@
   get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
 
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
   if (score > 8.0f) return 100;
   if (score < -8.0f) return 0;
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
index c520c3c..8a6c570 100644
--- a/av1/encoder/x86/ml_sse3.c
+++ b/av1/encoder/x86/ml_sse3.c
@@ -151,7 +151,7 @@
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_sse3(const float *input_nodes,
-                         const NN_CONFIG *const nn_config,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
                          float *const output) {
   float buf[2][NN_MAX_NODES_PER_LAYER];
   int buf_index = 0;
@@ -240,4 +240,5 @@
     num_inputs = num_outputs;
     buf_index = 1 - buf_index;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 0574a15..f6c06b9 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -25,7 +25,7 @@
 namespace {
 typedef void (*NnPredict_Func)(const float *const input_nodes,
                                const NN_CONFIG *const nn_config,
-                               float *const output);
+                               int reduce_prec, float *const output);
 
 typedef ::testing::tuple<const NnPredict_Func> NnPredictTestParam;
 
@@ -115,8 +115,8 @@
       weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
     }
 
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
-    target_func_(inputs, &nn_config, outputs_test);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+    target_func_(inputs, &nn_config, 0, outputs_test);
     libaom_test::ClearSystemState();
 
     for (int node = 0; node < shape->num_outputs; node++) {
@@ -155,13 +155,13 @@
   aom_usec_timer timer;
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
   }
   aom_usec_timer_mark(&timer);
   const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    target_func_(inputs, &nn_config, outputs_test);
+    target_func_(inputs, &nn_config, 0, outputs_test);
   }
   aom_usec_timer_mark(&timer);
   libaom_test::ClearSystemState();