Add a precision reduction step after nn_predict. The precision reduction step is a temporary fix to reduce the probability of mismatches between C and SIMD implementations for floating point av1_nn_predict(). Ideally the SIMD implementation needs to be redone. The patch fixes the issue in bug 2415, but there is no guarantee that mismatches will never happen, since the error is often larger than the reduced precision. STATS_CHANGED in the noise range: lowres (33 frames end-usage q cpu-used 0): +0.003; midres (33 frames end-usage q cpu-used 0): -0.011. BUG=aomedia:2415 Change-Id: I70298e0e35abfe86cb65ad12b7ee506f9b736e74

commit: d44f5d12c2c6dd6a81ebc0ee54c786f649885503 [log] [tgz]
author: Debargha Mukherjee <debargha@google.com> Thu Jun 27 14:56:05 2019 -0700
committer: Debargha Mukherjee <debargha@google.com> Tue Jul 02 16:41:07 2019 +0000
tree: 560ca0d812927b6e2d4ed6048ba22fd4bd75810b
parent: 39fca5f5821c4e77a60e0d4c29726b842d388712 [diff]
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 35f34b6..ee4325c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -329,7 +329,7 @@
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
 
-  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
   specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions

diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c
index b5d8a16..57228ec 100644
--- a/av1/encoder/ml.c
+++ b/av1/encoder/ml.c

@@ -15,11 +15,21 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/ml.h"
 
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+  const int prec_bits = 11;  // Outputs are snapped onto a 2^-11 grid.
+  const int prec = 1 << prec_bits;  // Grid steps per unit (2048).
+  const float inv_prec = (float)(1.0 / prec);  // Width of one grid step.
+  for (int i = 0; i < num_output; i++) {  // Rewrites output[] in place.
+    output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+  }  // NOTE(review): (int) truncates toward zero; negatives round differently.
+}
+
 // Calculate prediction based on the given input features and neural net config.
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_c(const float *input_nodes,
-                      const NN_CONFIG *const nn_config, float *const output) {
+                      const NN_CONFIG *const nn_config, int reduce_prec,
+                      float *const output) {
   int num_input_nodes = nn_config->num_inputs;
   int buf_index = 0;
   float buf[2][NN_MAX_NODES_PER_LAYER];
@@ -55,6 +65,7 @@
       val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
     output[node] = val;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
 
 #if CONFIG_NN_V2
@@ -107,7 +118,7 @@
 }
 
 void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
-                       float *output) {
+                       int reduce_prec, float *output) {
   const float *input_nodes = feature;
 
   // Propagate the layers.
@@ -124,6 +135,7 @@
   assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
   // Copy the final layer output
   memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
 }
 #endif  // CONFIG_NN_V2
 

diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h
index e3c5eac..62d543d 100644
--- a/av1/encoder/ml.h
+++ b/av1/encoder/ml.h

@@ -63,7 +63,7 @@
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
-                       float *output);
+                       int reduce_prec, float *output);
 #endif  // CONFIG_NN_V2
 
 // Applies the softmax normalization function to the input
@@ -71,6 +71,10 @@
 // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
 void av1_nn_softmax(const float *input, float *output, int n);
 
+// Applies a precision reduction to output of av1_nn_predict to reduce the
+// chance of mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index f16532a..f14830b 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c

@@ -208,7 +208,7 @@
   }
 
   // Make decision
-  av1_nn_predict(dnn_features, dnn_config, logits);
+  av1_nn_predict(dnn_features, dnn_config, 1, logits);
   aom_clear_system_state();
 
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
@@ -287,7 +287,7 @@
 
   float score = 0.0f;
 
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
 
   if (score > split_only_thresh) {
@@ -568,7 +568,7 @@
                               ? PARTITION_TYPES
                               : EXT_PARTITION_TYPES;
 
-  av1_nn_predict(features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   aom_clear_system_state();
 
   av1_nn_softmax(scores, probs, num_classes);
@@ -758,7 +758,7 @@
   assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE);
 
   aom_clear_system_state();
-  av1_nn_predict(features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
 
   int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
@@ -929,7 +929,7 @@
   assert(f_idx == FEATURES);
 
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   // Score is indicator of confidence that we should NOT terminate.
   if (score < thresh) *terminate_partition_search = 1;
 }
@@ -1017,7 +1017,7 @@
 
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
   float raw_scores[3] = { 0.0f };
-  av1_nn_predict(features, nn_config, raw_scores);
+  av1_nn_predict(features, nn_config, 1, raw_scores);
   aom_clear_system_state();
   float probs[3] = { 0.0f };
   av1_nn_softmax(raw_scores, probs, 3);
@@ -1085,7 +1085,7 @@
 
   // Calculate scores using the NN model.
   float score[16] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
+  av1_nn_predict(features, nn_config, 1, score);
   aom_clear_system_state();
   int int_score[16];
   int max_score = -1000;
@@ -1225,7 +1225,7 @@
 
   // Calculate scores using the NN model.
   float score[LABELS] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
+  av1_nn_predict(features, nn_config, 1, score);
   aom_clear_system_state();
   int int_score[LABELS];
   int max_score = -1000;
@@ -1309,7 +1309,7 @@
 
   // Calculate score using the NN model.
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
 
   // Make decision.

diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 9d0f6e6..ec90f6f 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c

@@ -1178,7 +1178,7 @@
     /* clang-format on */
     // Infer using ML model.
     float score;
-    av1_nn_predict(features, &av1_use_flat_gop_nn_config, &score);
+    av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
     use_alt_ref = (score <= 0.0);
   }
 

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 35ba214..0c8d0aa 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1656,11 +1656,11 @@
                                   &vfeatures[vfeatures_num - 1]);
   aom_clear_system_state();
 #if CONFIG_NN_V2
-  av1_nn_predict_v2(hfeatures, nn_config_hor, hscores);
-  av1_nn_predict_v2(vfeatures, nn_config_ver, vscores);
+  av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+  av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
 #else
-  av1_nn_predict(hfeatures, nn_config_hor, hscores);
-  av1_nn_predict(vfeatures, nn_config_ver, vscores);
+  av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+  av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
 #endif
   aom_clear_system_state();
 
@@ -2660,8 +2660,8 @@
   features[7] = (float)vert_corr;
 
   float rate_f, dist_by_sse_norm_f;
-  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
-  av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+  av1_nn_predict(features, &av1_pustats_dist_nnconfig, 1, &dist_by_sse_norm_f);
+  av1_nn_predict(features, &av1_pustats_rate_nnconfig, 1, &rate_f);
   aom_clear_system_state();
   const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -5019,7 +5019,7 @@
   get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
 
   float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
+  av1_nn_predict(features, nn_config, 1, &score);
   aom_clear_system_state();
   if (score > 8.0f) return 100;
   if (score < -8.0f) return 0;

diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
index c520c3c..8a6c570 100644
--- a/av1/encoder/x86/ml_sse3.c
+++ b/av1/encoder/x86/ml_sse3.c

@@ -151,7 +151,7 @@
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_sse3(const float *input_nodes,
-                         const NN_CONFIG *const nn_config,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
                          float *const output) {
   float buf[2][NN_MAX_NODES_PER_LAYER];
   int buf_index = 0;
@@ -240,4 +240,5 @@
     num_inputs = num_outputs;
     buf_index = 1 - buf_index;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }

diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 0574a15..f6c06b9 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc

@@ -25,7 +25,7 @@
 namespace {
 typedef void (*NnPredict_Func)(const float *const input_nodes,
                                const NN_CONFIG *const nn_config,
-                               float *const output);
+                               int reduce_prec, float *const output);
 
 typedef ::testing::tuple<const NnPredict_Func> NnPredictTestParam;
 
@@ -115,8 +115,8 @@
       weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
     }
 
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
-    target_func_(inputs, &nn_config, outputs_test);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+    target_func_(inputs, &nn_config, 0, outputs_test);
     libaom_test::ClearSystemState();
 
     for (int node = 0; node < shape->num_outputs; node++) {
@@ -155,13 +155,13 @@
   aom_usec_timer timer;
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
   }
   aom_usec_timer_mark(&timer);
   const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    target_func_(inputs, &nn_config, outputs_test);
+    target_func_(inputs, &nn_config, 0, outputs_test);
   }
   aom_usec_timer_mark(&timer);
   libaom_test::ClearSystemState();
commit	d44f5d12c2c6dd6a81ebc0ee54c786f649885503	[log] [tgz]
author	Debargha Mukherjee <debargha@google.com>	Thu Jun 27 14:56:05 2019 -0700
committer	Debargha Mukherjee <debargha@google.com>	Tue Jul 02 16:41:07 2019 +0000
tree	560ca0d812927b6e2d4ed6048ba22fd4bd75810b
parent	39fca5f5821c4e77a60e0d4c29726b842d388712 [diff]