SSE3-optimised av1_nn_predict

I have developed a SIMD-optimised neural network implementation using
SSE3.  I have also added functional equivalence tests between this and
the original implementation.  I added aom_clear_system_state() to a few
places where FPU operations are used after av1_nn_predict.

Speed-ups over the original C implementation for various network shapes:
10x64x16: 1.72x
12x12x1:  2.72x
12x24x1:  2.35x
12x32x1:  3.34x
18x24x4:  0.94x
18x32x4:  0.93x
4x16x1:   2.01x
8x16x1:   1.89x
8x16x4:   2.02x
8x24x1:   2.77x
8x32x1:   2.98x
8x64x1:   3.76x
9x32x3:   1.08x
4x8x4:    1.66x

A few awkwardly-shaped networks are slightly slower: these could be
padded to more convenient sizes to use the SIMD kernels.

I also wrote an AVX/AVX2 implementation but on these relatively small
networks it was barely faster than the SSE3 code.

Change-Id: I6a72be12cb7df8cf946578c3e01b21a439377d45
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 7976746..8c92615 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -264,6 +264,8 @@
             "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
+
 list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
 
@@ -382,6 +384,14 @@
     endif()
   endif()
 
+  if(HAVE_SSE3)
+    require_compiler_flag_nomsvc("-msse3" NO)
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_SSE3" "aom")
+    endif()
+  endif()
+
   if(HAVE_SSSE3)
     require_compiler_flag_nomsvc("-mssse3" NO)
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e38f897..c167c65 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -33,6 +33,8 @@
 struct aom_variance_vtable;
 struct search_site_config;
 struct yv12_buffer_config;
+struct NN_CONFIG;
+typedef struct NN_CONFIG NN_CONFIG;
 
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
@@ -308,6 +310,9 @@
 
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
+
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index d5833f2..e5b7120 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2895,6 +2895,7 @@
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
   float raw_scores[3] = { 0.0f };
   av1_nn_predict(features, nn_config, raw_scores);
+  aom_clear_system_state();
   float probs[3] = { 0.0f };
   av1_nn_softmax(raw_scores, probs, 3);
 
@@ -2962,6 +2963,7 @@
   // Calculate scores using the NN model.
   float score[16] = { 0.0f };
   av1_nn_predict(features, nn_config, score);
+  aom_clear_system_state();
   int int_score[16];
   int max_score = -1000;
   for (int i = 0; i < 16; ++i) {
@@ -3128,6 +3130,7 @@
   // Calculate scores using the NN model.
   float score[LABELS] = { 0.0f };
   av1_nn_predict(features, nn_config, score);
+  aom_clear_system_state();
   int int_score[LABELS];
   int max_score = -1000;
   for (int i = 0; i < LABELS; ++i) {
@@ -3212,6 +3215,7 @@
   // Calculate score using the NN model.
   float score = 0.0f;
   av1_nn_predict(features, nn_config, &score);
+  aom_clear_system_state();
 
   // Make decision.
   return (int)(score * 100) >= thresh;
diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c
index d21def4..ad664ac 100644
--- a/av1/encoder/ml.c
+++ b/av1/encoder/ml.c
@@ -15,31 +15,31 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/ml.h"
 
-void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
-                    float *output) {
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_c(const float *input_nodes,
+                      const NN_CONFIG *const nn_config, float *const output) {
   int num_input_nodes = nn_config->num_inputs;
   int buf_index = 0;
   float buf[2][NN_MAX_NODES_PER_LAYER];
-  const float *input_nodes = features;
 
   // Propagate hidden layers.
   const int num_layers = nn_config->num_hidden_layers;
   assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
   for (int layer = 0; layer < num_layers; ++layer) {
-    const float *weights = nn_config->weights[layer];
-    const float *bias = nn_config->bias[layer];
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
     float *output_nodes = buf[buf_index];
     const int num_output_nodes = nn_config->num_hidden_nodes[layer];
     assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
     for (int node = 0; node < num_output_nodes; ++node) {
-      float val = 0.0f;
+      float val = layer_bias[node];
       for (int i = 0; i < num_input_nodes; ++i)
-        val += weights[i] * input_nodes[i];
-      val += bias[node];
+        val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
       // ReLU as activation function.
       val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
       output_nodes[node] = val;
-      weights += num_input_nodes;
     }
     num_input_nodes = num_output_nodes;
     input_nodes = output_nodes;
@@ -47,14 +47,13 @@
   }
 
   // Final output layer.
-  const float *weights = nn_config->weights[num_layers];
+  const float *layer_weights = nn_config->weights[num_layers];
+  const float *layer_bias = nn_config->bias[num_layers];
   for (int node = 0; node < nn_config->num_outputs; ++node) {
-    const float *bias = nn_config->bias[num_layers];
-    float val = 0.0f;
+    float val = layer_bias[node];
     for (int i = 0; i < num_input_nodes; ++i)
-      val += weights[i] * input_nodes[i];
-    output[node] = val + bias[node];
-    weights += num_input_nodes;
+      val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+    output[node] = val;
   }
 }
 
diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h
index cb8ef28..7f2750b 100644
--- a/av1/encoder/ml.h
+++ b/av1/encoder/ml.h
@@ -16,10 +16,12 @@
 extern "C" {
 #endif
 
+#include "config/av1_rtcd.h"
+
 #define NN_MAX_HIDDEN_LAYERS 10
 #define NN_MAX_NODES_PER_LAYER 128
 
-typedef struct {
+struct NN_CONFIG {
   int num_inputs;         // Number of input nodes, i.e. features.
   int num_outputs;        // Number of output nodes.
   int num_hidden_layers;  // Number of hidden layers, maximum 10.
@@ -29,13 +31,8 @@
   const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
   // Bias parameters, indexed by layer.
   const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
-} NN_CONFIG;
-
-// Calculate prediction based on the given input features and neural net config.
-// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
-// layer.
-void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
-                    float *output);
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
 
 // Applies the softmax normalization function to the input
 // to get a valid probability distribution in the output:
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 3c1cd8f..5b7747e 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1836,6 +1836,7 @@
                                   &vfeatures[vfeatures_num - 1]);
   av1_nn_predict(hfeatures, nn_config_hor, hscores);
   av1_nn_predict(vfeatures, nn_config_ver, vscores);
+  aom_clear_system_state();
 
   float score_2D_average = 0.0f;
   for (int i = 0; i < 4; i++) {
@@ -2739,10 +2740,10 @@
   float rate_f, dist_by_sse_norm_f;
   av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
   av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+  aom_clear_system_state();
   const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
   int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-  aom_clear_system_state();
 
   // Check if skip is better
   if (rate_i == 0) {
@@ -4813,6 +4814,7 @@
 
   float score = 0.0f;
   av1_nn_predict(features, nn_config, &score);
+  aom_clear_system_state();
   if (score > 8.0f) return 100;
   if (score < -8.0f) return 0;
   score = 1.0f / (1.0f + (float)exp(-score));
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
new file mode 100644
index 0000000..c520c3c
--- /dev/null
+++ b/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <pmmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+// In order to avoid the high-latency of swapping between FPU and SIMD
+// operations, we keep the result in a 128-bit register even though we only
+// care about a single value.
+static void nn_propagate_8to1(const float *const inputs,
+                              const float *const weights,
+                              __m128 *const output) {
+  const __m128 inputs_h = _mm_loadu_ps(&inputs[4]);
+  const __m128 inputs_l = _mm_loadu_ps(inputs);
+
+  const __m128 weights_h = _mm_loadu_ps(&weights[4]);
+  const __m128 weights_l = _mm_loadu_ps(weights);
+
+  const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h);
+  const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l);
+  // [7 6 5 4] [3 2 1 0] (weight and input indices)
+
+  const __m128 vadd = _mm_add_ps(mul_l, mul_h);
+  // [7+3 6+2 5+1 4+0]
+  const __m128 hadd1 = _mm_hadd_ps(vadd, vadd);
+  // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0]
+  const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+  // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0]
+  *output = _mm_add_ps(*output, hadd2);
+}
+
+static void nn_propagate_4to1(const float *const inputs,
+                              const float *const weights,
+                              __m128 *const output) {
+  const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+  const __m128 weights128 = _mm_loadu_ps(weights);
+
+  const __m128 mul = _mm_mul_ps(inputs128, weights128);
+  // [3 2 1 0] (weight and input indices)
+
+  const __m128 hadd1 = _mm_hadd_ps(mul, mul);
+  // [3+2 1+0 3+2 1+0]
+  const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+  // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0]
+  *output = _mm_add_ps(*output, hadd2);
+}
+
+static void nn_propagate_4to4(const float *const inputs,
+                              const float *const weights, __m128 *const outputs,
+                              const int num_inputs) {
+  const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+  __m128 hadd[2];
+  for (int i = 0; i < 2; i++) {  // For each pair of outputs
+    const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+    const __m128 mul0 = _mm_mul_ps(weight0, inputs128);
+    const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+    const __m128 mul1 = _mm_mul_ps(weight1, inputs128);
+    hadd[i] = _mm_hadd_ps(mul0, mul1);
+  }
+  // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+  // hadd[1] = [15+14 13+12 11+10 9+8]
+
+  const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]);
+  // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+
+  *outputs = _mm_add_ps(*outputs, hh);
+}
+
+static void nn_propagate_4to8(const float *const inputs,
+                              const float *const weights, __m128 *const out_h,
+                              __m128 *const out_l, const int num_inputs) {
+  const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+  __m128 hadd[4];
+  for (int i = 0; i < 4; i++) {  // For each pair of outputs
+    const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+    const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+    const __m128 mul0 = _mm_mul_ps(inputs128, weight0);
+    const __m128 mul1 = _mm_mul_ps(inputs128, weight1);
+    hadd[i] = _mm_hadd_ps(mul0, mul1);
+  }
+  // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+  // hadd[1] = [15+14 13+12 11+10 9+8]
+  // hadd[2] = [23+22 21+20 19+18 17+16]
+  // hadd[3] = [31+30 29+28 27+26 25+24]
+
+  const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]);
+  // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+  const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]);
+  // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16]
+
+  *out_h = _mm_add_ps(*out_h, hh1);
+  *out_l = _mm_add_ps(*out_l, hh0);
+}
+
+static void nn_propagate_8to4(const float *const inputs,
+                              const float *const weights, __m128 *const outputs,
+                              const int num_inputs) {
+  const __m128 inputs_h = _mm_loadu_ps(inputs + 4);
+  const __m128 inputs_l = _mm_loadu_ps(inputs);
+  // [7 6 5 4] [3 2 1 0] (input indices)
+
+  __m128 add[4];
+  for (int i = 0; i < 4; i++) {  // For each output:
+    const __m128 weight_h = _mm_loadu_ps(&weights[i * num_inputs + 4]);
+    const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]);
+    const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h);
+    const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l);
+    add[i] = _mm_add_ps(mul_l, mul_h);
+  }
+  // add[0] = [7+3 6+2 5+1 4+0]
+  // add[1] = [15+11 14+10 13+9 12+8]
+  // add[2] = [23+19 22+18 21+17 20+16]
+  // add[3] = [31+27 30+26 29+25 28+24]
+
+  const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]);
+  // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16]
+  const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]);
+  // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0]
+
+  const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h);
+  // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16
+  //  15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0]
+
+  *outputs = _mm_add_ps(*outputs, haddhadd);
+}
+
+static void nn_activate8(__m128 *out_h, __m128 *out_l) {
+  const __m128 zero = _mm_setzero_ps();
+  *out_h = _mm_max_ps(*out_h, zero);
+  *out_l = _mm_max_ps(*out_l, zero);
+}
+
+static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); }
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_sse3(const float *input_nodes,
+                         const NN_CONFIG *const nn_config,
+                         float *const output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int buf_index = 0;
+  int num_inputs = nn_config->num_inputs;
+
+  // Hidden layers, except the final iteration is the output layer.
+  for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    bool output_layer = (layer == nn_config->num_hidden_layers);
+    float *const output_nodes = output_layer ? output : buf[buf_index];
+    const int num_outputs = output_layer ? nn_config->num_outputs
+                                         : nn_config->num_hidden_nodes[layer];
+
+    if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out += 8) {
+        __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
+        __m128 out_l = _mm_loadu_ps(&layer_bias[out]);
+        for (int in = 0; in < num_inputs; in += 4) {
+          nn_propagate_4to8(&input_nodes[in],
+                            &layer_weights[out * num_inputs + in], &out_h,
+                            &out_l, num_inputs);
+        }
+        if (!output_layer) nn_activate8(&out_h, &out_l);
+        _mm_storeu_ps(&output_nodes[out + 4], out_h);
+        _mm_storeu_ps(&output_nodes[out], out_l);
+      }
+    } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+        for (int in = 0; in < num_inputs; in += 8) {
+          nn_propagate_8to4(&input_nodes[in],
+                            &layer_weights[out * num_inputs + in], &outputs,
+                            num_inputs);
+        }
+        if (!output_layer) nn_activate4(&outputs);
+        _mm_storeu_ps(&output_nodes[out], outputs);
+      }
+    } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+        for (int in = 0; in < num_inputs; in += 4) {
+          nn_propagate_4to4(&input_nodes[in],
+                            &layer_weights[out * num_inputs + in], &outputs,
+                            num_inputs);
+        }
+        if (!output_layer) nn_activate4(&outputs);
+        _mm_storeu_ps(&output_nodes[out], outputs);
+      }
+    } else if (num_inputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        __m128 total = _mm_load1_ps(&layer_bias[out]);
+        for (int in = 0; in < num_inputs; in += 8) {
+          nn_propagate_8to1(&input_nodes[in],
+                            &layer_weights[out * num_inputs + in], &total);
+        }
+        if (!output_layer) nn_activate4(&total);
+        output_nodes[out] = _mm_cvtss_f32(total);
+      }
+    } else if (num_inputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        __m128 total = _mm_load1_ps(&layer_bias[out]);
+        for (int in = 0; in < num_inputs; in += 4) {
+          nn_propagate_4to1(&input_nodes[in],
+                            &layer_weights[out * num_inputs + in], &total);
+        }
+        if (!output_layer) nn_activate4(&total);
+        output_nodes[out] = _mm_cvtss_f32(total);
+      }
+    } else {
+      // Use SSE instructions for scalar operations to avoid the latency of
+      // swapping between SIMD and FPU modes.
+      for (int out = 0; out < num_outputs; out++) {
+        __m128 total = _mm_load1_ps(&layer_bias[out]);
+        for (int in_node = 0; in_node < num_inputs; in_node++) {
+          __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+          __m128 weight =
+              _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+          total = _mm_add_ps(total, _mm_mul_ps(input, weight));
+        }
+        if (!output_layer) nn_activate4(&total);
+        output_nodes[out] = _mm_cvtss_f32(total);
+      }
+    }
+    input_nodes = output_nodes;
+    num_inputs = num_outputs;
+    buf_index = 1 - buf_index;
+  }
+}
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
new file mode 100644
index 0000000..bda8868
--- /dev/null
+++ b/test/av1_nn_predict_test.cc
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/ml.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+typedef void (*NnPredict_Func)(const float *const input_nodes,
+                               const NN_CONFIG *const nn_config,
+                               float *const output);
+
+typedef libaom_test::FuncParam<NnPredict_Func> TestFuncs;
+
+typedef ::testing::tuple<const NnPredict_Func> NnPredictTestParam;
+
+const float epsilon = 1e-3f;  // Error threshold for functional equivalence
+
+class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
+ public:
+  virtual void SetUp() {
+    const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+    // Allocate two massive buffers on the heap for edge weights and node bias
+    // Then set-up the double-dimension arrays pointing into the big buffers
+    weights_buf = (float *)aom_malloc(MAX_NODES2 * (NN_MAX_HIDDEN_LAYERS + 1) *
+                                      sizeof(*weights_buf));
+    bias_buf =
+        (float *)aom_malloc(NN_MAX_NODES_PER_LAYER *
+                            (NN_MAX_HIDDEN_LAYERS + 1) * sizeof(*bias_buf));
+    ASSERT_NE(weights_buf, nullptr);
+    ASSERT_NE(bias_buf, nullptr);
+    for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+      weights[i] = &weights_buf[i * MAX_NODES2];
+      bias[i] = &bias_buf[i * NN_MAX_NODES_PER_LAYER];
+    }
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(weights_buf);
+    aom_free(bias_buf);
+  }
+  void runNnPredictTest(const NN_CONFIG *const shape);
+  void runNnPredictSpeedTest(const NN_CONFIG *const shape, const int run_times);
+  void runNnPredictTest_all(const NN_CONFIG *const shapes,
+                            const int num_shapes);
+  void runNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                 const int num_shapes, const int run_times);
+  void predict_original(const float *features, const NN_CONFIG *nn_config,
+                        float *output);
+
+ private:
+  NnPredict_Func target_func_;
+  ACMRandom rng_;
+  float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *weights_buf = nullptr, *bias_buf = nullptr;
+};
+
+void NnPredictTest::runNnPredictTest(const NN_CONFIG *const shape) {
+  aom_clear_system_state();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(NN_CONFIG));
+
+  char shape_str[32] = { 0 };
+  snprintf(shape_str, sizeof(shape_str), "%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    snprintf(&shape_str[strlen(shape_str)],
+             sizeof(shape_str) - strlen(shape_str), "x%d",
+             shape->num_hidden_nodes[layer]);
+  snprintf(&shape_str[strlen(shape_str)], sizeof(shape_str) - strlen(shape_str),
+           "x%d", shape->num_outputs);
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+
+  for (int iter = 0; iter < 10000 && !HasFatalFailure(); ++iter) {
+    for (int node = 0; node < shape->num_inputs; node++) {
+      inputs[node] = (int32_t)rng_.Rand31() - (1 << 30);
+      inputs[node] /= (1 << 31);
+    }
+    for (int layer = 0; layer < shape->num_hidden_layers; layer++) {
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+        bias[layer][node] = (int32_t)rng_.Rand31() - (1 << 30);
+        bias[layer][node] /= (1 << 31);
+      }
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+           node++) {
+        weights[layer][node] = (int32_t)rng_.Rand31() - (1 << 30);
+        weights[layer][node] /= (1 << 31);
+      }
+    }
+    // Now the outputs:
+    int layer = shape->num_hidden_layers;
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+      bias[layer][node] = (int32_t)rng_.Rand31() - (1 << 30);
+      bias[layer][node] /= (1 << 31);
+    }
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+         node++) {
+      weights[layer][node] = (int32_t)rng_.Rand31() - (1 << 30);
+      weights[layer][node] /= (1 << 31);
+    }
+
+    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
+    target_func_(inputs, &nn_config, outputs_test);
+    aom_clear_system_state();
+
+    for (int node = 0; node < shape->num_outputs; node++) {
+      if (outputs_ref[node] < epsilon) {
+        ASSERT_LE(outputs_test[node], epsilon)
+            << "Reference output was near-zero, test output was not ("
+            << shape_str << ")";
+      } else {
+        const float error = outputs_ref[node] - outputs_test[node];
+        const float relative_error = fabs(error / outputs_ref[node]);
+        ASSERT_LE(relative_error, epsilon)
+            << "Excessive relative error between reference and test ("
+            << shape_str << ")";
+      }
+    }
+  }
+  //  printf("Network shape %s passed\n", shape_str);
+}
+
+void NnPredictTest::runNnPredictSpeedTest(const NN_CONFIG *const shape,
+                                          const int run_times) {
+  aom_clear_system_state();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(NN_CONFIG));
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+  // Don't bother actually changing the values for inputs/weights/bias: it
+  // shouldn't make any difference for a speed test.
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
+  }
+  aom_usec_timer_mark(&timer);
+  const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    target_func_(inputs, &nn_config, outputs_test);
+  }
+  aom_usec_timer_mark(&timer);
+  aom_clear_system_state();
+  const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf("%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    printf("x%d", shape->num_hidden_nodes[layer]);
+  printf("x%d: ", shape->num_outputs);
+  printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+}
+
+// This is all the neural network shapes observed executed in a few different
+// runs of the encoder.  It also conveniently covers all the kernels
+// implemented.
+static const NN_CONFIG shapes[] = {
+  { .num_inputs = 10,
+    .num_outputs = 16,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 64 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 12,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 12 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 12,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 24 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 12,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 32 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 18,
+    .num_outputs = 4,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 24 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 18,
+    .num_outputs = 4,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 32 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 4,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 16 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 8,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 16 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 8,
+    .num_outputs = 4,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 16 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 8,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 24 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 8,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 32 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 8,
+    .num_outputs = 1,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 64 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 9,
+    .num_outputs = 3,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 32 },
+    .weights = { 0 },
+    .bias = { 0 } },
+  { .num_inputs = 4,
+    .num_outputs = 4,
+    .num_hidden_layers = 1,
+    .num_hidden_nodes = { 8 },
+    .weights = { 0 },
+    .bias = { 0 } },
+};
+
+void NnPredictTest::runNnPredictTest_all(const NN_CONFIG *const shapes,
+                                         const int num_shapes) {
+  for (int i = 0; i < num_shapes; i++) runNnPredictTest(&shapes[i]);
+}
+
+void NnPredictTest::runNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                              const int num_shapes,
+                                              const int run_times) {
+  for (int i = 0; i < num_shapes; i++)
+    NnPredictTest::runNnPredictSpeedTest(&shapes[i], run_times);
+}
+
+TEST_P(NnPredictTest, RandomValues) {
+  runNnPredictTest_all(shapes, sizeof(shapes) / sizeof(NN_CONFIG));
+}
+
+TEST_P(NnPredictTest, DISABLED_Speed) {
+  runNnPredictSpeedTest_all(shapes, sizeof(shapes) / sizeof(NN_CONFIG),
+                            10000000);
+}
+
+#if HAVE_SSE3
+INSTANTIATE_TEST_CASE_P(SSE3, NnPredictTest,
+                        ::testing::Values(av1_nn_predict_sse3));
+#endif
+
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index d15e580..12f2319 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -170,6 +170,7 @@
               "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+              "${AOM_ROOT}/test/av1_nn_predict_test.cc"
               "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.h"