SSE3-optimised av1_nn_predict

I have added a SIMD-optimised SSE3 implementation of av1_nn_predict,
along with functional equivalence tests between it and the original C
implementation.  I have also added aom_clear_system_state() to a few
places where FPU operations are used after av1_nn_predict.
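
As a rough illustration of the approach (not the exact kernel in this
change), the hot loop in av1_nn_predict is a dot product of each
neuron's weights against the layer's input nodes, which maps onto
4-wide SSE3 multiplies plus a horizontal add.  The function and
parameter names below are assumptions, not code from this patch:

  #include <pmmintrin.h>  // SSE3 intrinsics

  // Sketch only: dot product of one neuron's weights against the input
  // nodes, 4 floats at a time, assuming num_inputs is a multiple of 4.
  static float nn_dot_product_sse3(const float *inputs,
                                   const float *weights, int num_inputs) {
    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < num_inputs; i += 4) {
      acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(&inputs[i]),
                                       _mm_loadu_ps(&weights[i])));
    }
    // Horizontal sum of the four partial sums (SSE3 haddps).
    acc = _mm_hadd_ps(acc, acc);
    acc = _mm_hadd_ps(acc, acc);
    return _mm_cvtss_f32(acc);
  }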

Speed-ups over the original C implementation for various network shapes:
10x64x16: 1.72x
12x12x1:  2.72x
12x24x1:  2.35x
12x32x1:  3.34x
18x24x4:  0.94x
18x32x4:  0.93x
4x16x1:   2.01x
8x16x1:   1.89x
8x16x4:   2.02x
8x24x1:   2.77x
8x32x1:   2.98x
8x64x1:   3.76x
9x32x3:   1.08x
4x8x4:    1.66x

A few awkwardly-shaped networks are slightly slower than the original C
code: these could be padded to more convenient sizes so that they too
can use the SIMD kernels.
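
Purely as an illustration of that padding idea (names here are assumed,
not from this patch), the weight and input vectors could be
zero-extended to the next multiple of 4 so the 4-wide kernel needs no
scalar tail; the extra zeros contribute nothing to the dot products:

  // Illustrative only: copy n floats and zero-pad up to a multiple of 4.
  static void pad_to_multiple_of_4(const float *src, int n, float *dst) {
    const int padded = (n + 3) & ~3;
    int i;
    for (i = 0; i < n; ++i) dst[i] = src[i];
    for (; i < padded; ++i) dst[i] = 0.0f;  // zeros leave the sums unchanged
  }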

I also wrote an AVX/AVX2 implementation, but on these relatively small
networks it was barely faster than the SSE3 code.

Change-Id: I6a72be12cb7df8cf946578c3e01b21a439377d45
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e38f897..c167c65 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -33,6 +33,8 @@
 struct aom_variance_vtable;
 struct search_site_config;
 struct yv12_buffer_config;
+struct NN_CONFIG;
+typedef struct NN_CONFIG NN_CONFIG;
 
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
@@ -308,6 +310,9 @@
 
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
+
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions