SSE3-optimised av1_nn_predict

I have added a SIMD-optimised SSE3 implementation of av1_nn_predict,
along with functional equivalence tests between it and the original C
implementation.  I have also added aom_clear_system_state() to a few
places where FPU operations are used after av1_nn_predict.
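
As a rough illustration of the approach (not the exact kernel in this
change), the hot loop in av1_nn_predict is a dot product of each
neuron's weights against the layer's input nodes, which maps onto
4-wide SSE3 multiplies plus a horizontal add.  The function and
parameter names below are assumptions, not code from this patch:

  #include <pmmintrin.h>  // SSE3 intrinsics

  // Sketch only: dot product of one neuron's weights against the input
  // nodes, 4 floats at a time, assuming num_inputs is a multiple of 4.
  static float nn_dot_product_sse3(const float *inputs,
                                   const float *weights, int num_inputs) {
    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < num_inputs; i += 4) {
      acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(&inputs[i]),
                                       _mm_loadu_ps(&weights[i])));
    }
    // Horizontal sum of the four partial sums (SSE3 haddps).
    acc = _mm_hadd_ps(acc, acc);
    acc = _mm_hadd_ps(acc, acc);
    return _mm_cvtss_f32(acc);
  }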

Speed-ups over the original C implementation for various network shapes:
10x64x16: 1.72x
12x12x1:  2.72x
12x24x1:  2.35x
12x32x1:  3.34x
18x24x4:  0.94x
18x32x4:  0.93x
4x16x1:   2.01x
8x16x1:   1.89x
8x16x4:   2.02x
8x24x1:   2.77x
8x32x1:   2.98x
8x64x1:   3.76x
9x32x3:   1.08x
4x8x4:    1.66x

A few awkwardly-shaped networks are slightly slower than the original C
code: these could be padded to more convenient sizes so that they too
can use the SIMD kernels.
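
Purely as an illustration of that padding idea (names here are assumed,
not from this patch), the weight and input vectors could be
zero-extended to the next multiple of 4 so the 4-wide kernel needs no
scalar tail; the extra zeros contribute nothing to the dot products:

  // Illustrative only: copy n floats and zero-pad up to a multiple of 4.
  static void pad_to_multiple_of_4(const float *src, int n, float *dst) {
    const int padded = (n + 3) & ~3;
    int i;
    for (i = 0; i < n; ++i) dst[i] = src[i];
    for (; i < padded; ++i) dst[i] = 0.0f;  // zeros leave the sums unchanged
  }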

I also wrote an AVX/AVX2 implementation, but on these relatively small
networks it was barely faster than the SSE3 code.

Change-Id: I6a72be12cb7df8cf946578c3e01b21a439377d45
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e38f897..c167c65 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -33,6 +33,8 @@
 struct aom_variance_vtable;
 struct search_site_config;
 struct yv12_buffer_config;
+struct NN_CONFIG;
+typedef struct NN_CONFIG NN_CONFIG;
 
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
@@ -308,6 +310,9 @@
 
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
+
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions