SSE3-optimised av1_nn_predict I have developed a SIMD-optimised neural network implementation using SSE3. I have also added functional equivalence tests between this and the original implementation. I added aom_clear_system_state() to a few places where FPU operations are used after av1_nn_predict. Speed-ups over the original C implementation for various network shapes: 10x64x16: 1.72x 12x12x1: 2.72x 12x24x1: 2.35x 12x32x1: 3.34x 18x24x4: 0.94x 18x32x4: 0.93x 4x16x1: 2.01x 8x16x1: 1.89x 8x16x4: 2.02x 8x24x1: 2.77x 8x32x1: 2.98x 8x64x1: 3.76x 9x32x3: 1.08x 4x8x4: 1.66x A few awkwardly-shaped networks are slightly slower: these could be padded to more convenient sizes to use the SIMD kernels. I also wrote an AVX/AVX2 implementation but on these relatively small networks it was barely faster than the SSE3 code. Change-Id: I6a72be12cb7df8cf946578c3e01b21a439377d45
diff --git a/av1/av1.cmake b/av1/av1.cmake index 7976746..8c92615 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -264,6 +264,8 @@ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c") + list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") @@ -382,6 +384,14 @@ endif() endif() + if(HAVE_SSE3) + require_compiler_flag_nomsvc("-msse3" NO) + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE3" "aom") + endif() + endif() + if(HAVE_SSSE3) require_compiler_flag_nomsvc("-mssse3" NO) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index e38f897..c167c65 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -33,6 +33,8 @@ struct aom_variance_vtable; struct search_site_config; struct yv12_buffer_config; +struct NN_CONFIG; +typedef struct NN_CONFIG NN_CONFIG; /* Function pointers return by CfL functions */ typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride, @@ -308,6 +310,9 @@ add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr"; specialize qw/av1_get_horver_correlation_full sse4_1 avx2/; + + add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output"; + specialize qw/av1_nn_predict sse3/; } # end encoder functions
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index d5833f2..e5b7120 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c
@@ -2895,6 +2895,7 @@ // 2. Do the prediction and prune 0-2 partitions based on their probabilities float raw_scores[3] = { 0.0f }; av1_nn_predict(features, nn_config, raw_scores); + aom_clear_system_state(); float probs[3] = { 0.0f }; av1_nn_softmax(raw_scores, probs, 3); @@ -2962,6 +2963,7 @@ // Calculate scores using the NN model. float score[16] = { 0.0f }; av1_nn_predict(features, nn_config, score); + aom_clear_system_state(); int int_score[16]; int max_score = -1000; for (int i = 0; i < 16; ++i) { @@ -3128,6 +3130,7 @@ // Calculate scores using the NN model. float score[LABELS] = { 0.0f }; av1_nn_predict(features, nn_config, score); + aom_clear_system_state(); int int_score[LABELS]; int max_score = -1000; for (int i = 0; i < LABELS; ++i) { @@ -3212,6 +3215,7 @@ // Calculate score using the NN model. float score = 0.0f; av1_nn_predict(features, nn_config, &score); + aom_clear_system_state(); // Make decision. return (int)(score * 100) >= thresh;
diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c index d21def4..ad664ac 100644 --- a/av1/encoder/ml.c +++ b/av1/encoder/ml.c
@@ -15,31 +15,31 @@ #include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/ml.h" -void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, - float *output) { +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_c(const float *input_nodes, + const NN_CONFIG *const nn_config, float *const output) { int num_input_nodes = nn_config->num_inputs; int buf_index = 0; float buf[2][NN_MAX_NODES_PER_LAYER]; - const float *input_nodes = features; // Propagate hidden layers. const int num_layers = nn_config->num_hidden_layers; assert(num_layers <= NN_MAX_HIDDEN_LAYERS); for (int layer = 0; layer < num_layers; ++layer) { - const float *weights = nn_config->weights[layer]; - const float *bias = nn_config->bias[layer]; + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; float *output_nodes = buf[buf_index]; const int num_output_nodes = nn_config->num_hidden_nodes[layer]; assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); for (int node = 0; node < num_output_nodes; ++node) { - float val = 0.0f; + float val = layer_bias[node]; for (int i = 0; i < num_input_nodes; ++i) - val += weights[i] * input_nodes[i]; - val += bias[node]; + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; // ReLU as activation function. val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). output_nodes[node] = val; - weights += num_input_nodes; } num_input_nodes = num_output_nodes; input_nodes = output_nodes; @@ -47,14 +47,13 @@ } // Final output layer. - const float *weights = nn_config->weights[num_layers]; + const float *layer_weights = nn_config->weights[num_layers]; + const float *layer_bias = nn_config->bias[num_layers]; for (int node = 0; node < nn_config->num_outputs; ++node) { - const float *bias = nn_config->bias[num_layers]; - float val = 0.0f; + float val = layer_bias[node]; for (int i = 0; i < num_input_nodes; ++i) - val += weights[i] * input_nodes[i]; - output[node] = val + bias[node]; - weights += num_input_nodes; + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; + output[node] = val; } }
diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h index cb8ef28..7f2750b 100644 --- a/av1/encoder/ml.h +++ b/av1/encoder/ml.h
@@ -16,10 +16,12 @@ extern "C" { #endif +#include "config/av1_rtcd.h" + #define NN_MAX_HIDDEN_LAYERS 10 #define NN_MAX_NODES_PER_LAYER 128 -typedef struct { +struct NN_CONFIG { int num_inputs; // Number of input nodes, i.e. features. int num_outputs; // Number of output nodes. int num_hidden_layers; // Number of hidden layers, maximum 10. @@ -29,13 +31,8 @@ const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; // Bias parameters, indexed by layer. const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; -} NN_CONFIG; - -// Calculate prediction based on the given input features and neural net config. -// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden -// layer. -void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, - float *output); +}; +// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs // Applies the softmax normalization function to the input // to get a valid probability distribution in the output:
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 3c1cd8f..5b7747e 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c
@@ -1836,6 +1836,7 @@ &vfeatures[vfeatures_num - 1]); av1_nn_predict(hfeatures, nn_config_hor, hscores); av1_nn_predict(vfeatures, nn_config_ver, vscores); + aom_clear_system_state(); float score_2D_average = 0.0f; for (int i = 0; i < 4; i++) { @@ -2739,10 +2740,10 @@ float rate_f, dist_by_sse_norm_f; av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f); av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f); + aom_clear_system_state(); const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm)); int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); - aom_clear_system_state(); // Check if skip is better if (rate_i == 0) { @@ -4813,6 +4814,7 @@ float score = 0.0f; av1_nn_predict(features, nn_config, &score); + aom_clear_system_state(); if (score > 8.0f) return 100; if (score < -8.0f) return 0; score = 1.0f / (1.0f + (float)exp(-score));
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c new file mode 100644 index 0000000..c520c3c --- /dev/null +++ b/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,243 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdbool.h> +#include <assert.h> +#include <pmmintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" + +// In order to avoid the high-latency of swapping between FPU and SIMD +// operations, we keep the result in a 128-bit register even though we only +// care about a single value. +static void nn_propagate_8to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); + const __m128 inputs_l = _mm_loadu_ps(inputs); + + const __m128 weights_h = _mm_loadu_ps(&weights[4]); + const __m128 weights_l = _mm_loadu_ps(weights); + + const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); + // [7 6 5 4] [3 2 1 0] (weight and input indices) + + const __m128 vadd = _mm_add_ps(mul_l, mul_h); + // [7+3 6+2 5+1 4+0] + const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); + // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + const __m128 weights128 = _mm_loadu_ps(weights); + + const __m128 mul = _mm_mul_ps(inputs128, weights128); + // [3 2 1 0] (weight and input indices) + + const __m128 hadd1 = _mm_hadd_ps(mul, mul); + // [3+2 1+0 3+2 1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[2]; + for (int i = 0; i < 2; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 mul0 = _mm_mul_ps(weight0, inputs128); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul1 = _mm_mul_ps(weight1, inputs128); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + + const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + + *outputs = _mm_add_ps(*outputs, hh); +} + +static void nn_propagate_4to8(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[4]; + for (int i = 0; i < 4; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul0 = _mm_mul_ps(inputs128, weight0); + const __m128 mul1 = _mm_mul_ps(inputs128, weight1); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + // hadd[2] = [23+22 21+20 19+18 17+16] + // hadd[3] = [31+30 29+28 27+26 25+24] + + const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); + // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] + + *out_h = _mm_add_ps(*out_h, hh1); + *out_l = _mm_add_ps(*out_l, hh0); +} + +static void nn_propagate_8to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs_h = _mm_loadu_ps(inputs + 4); + const __m128 inputs_l = _mm_loadu_ps(inputs); + // [7 6 5 4] [3 2 1 0] (input indices) + + __m128 add[4]; + for (int i = 0; i < 4; i++) { // For each output: + const __m128 weight_h = _mm_loadu_ps(&weights[i * num_inputs + 4]); + const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); + const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); + add[i] = _mm_add_ps(mul_l, mul_h); + } + // add[0] = [7+3 6+2 5+1 4+0] + // add[1] = [15+11 14+10 13+9 12+8] + // add[2] = [23+19 22+18 21+17 20+16] + // add[3] = [31+27 30+26 29+25 28+24] + + const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); + // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] + const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); + // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] + + const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); + // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 + // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] + + *outputs = _mm_add_ps(*outputs, haddhadd); +} + +static void nn_activate8(__m128 *out_h, __m128 *out_l) { + const __m128 zero = _mm_setzero_ps(); + *out_h = _mm_max_ps(*out_h, zero); + *out_l = _mm_max_ps(*out_l, zero); +} + +static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_sse3(const float *input_nodes, + const NN_CONFIG *const nn_config, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : buf[buf_index]; + const int num_outputs = output_layer ? nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); + __m128 out_l = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to8(&input_nodes[in], + &layer_weights[out * num_inputs + in], &out_h, + &out_l, num_inputs); + } + if (!output_layer) nn_activate8(&out_h, &out_l); + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency of + // swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in_node = 0; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + total = _mm_add_ps(total, _mm_mul_ps(input, weight)); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } +}
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc new file mode 100644 index 0000000..bda8868 --- /dev/null +++ b/test/av1_nn_predict_test.cc
@@ -0,0 +1,307 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <vector> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/function_equivalence_test.h" +#include "test/register_state_check.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/system_state.h" +#include "av1/encoder/ml.h" + +using libaom_test::FunctionEquivalenceTest; + +namespace { +typedef void (*NnPredict_Func)(const float *const input_nodes, + const NN_CONFIG *const nn_config, + float *const output); + +typedef libaom_test::FuncParam<NnPredict_Func> TestFuncs; + +typedef ::testing::tuple<const NnPredict_Func> NnPredictTestParam; + +const float epsilon = 1e-3f; // Error threshold for functional equivalence + +class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> { + public: + virtual void SetUp() { + const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER; + // Allocate two massive buffers on the heap for edge weights and node bias + // Then set-up the double-dimension arrays pointing into the big buffers + weights_buf = (float *)aom_malloc(MAX_NODES2 * (NN_MAX_HIDDEN_LAYERS + 1) * + sizeof(*weights_buf)); + bias_buf = + (float *)aom_malloc(NN_MAX_NODES_PER_LAYER * + (NN_MAX_HIDDEN_LAYERS + 1) * sizeof(*bias_buf)); + ASSERT_NE(weights_buf, nullptr); + ASSERT_NE(bias_buf, nullptr); + for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) { + weights[i] = &weights_buf[i * MAX_NODES2]; + bias[i] = &bias_buf[i * NN_MAX_NODES_PER_LAYER]; + } + target_func_ = GET_PARAM(0); + } + virtual void TearDown() { + aom_free(weights_buf); + aom_free(bias_buf); + } + void runNnPredictTest(const NN_CONFIG *const shape); + void runNnPredictSpeedTest(const NN_CONFIG *const shape, const int run_times); + void runNnPredictTest_all(const NN_CONFIG *const shapes, + const int num_shapes); + void runNnPredictSpeedTest_all(const NN_CONFIG *const shapes, + const int num_shapes, const int run_times); + void predict_original(const float *features, const NN_CONFIG *nn_config, + float *output); + + private: + NnPredict_Func target_func_; + ACMRandom rng_; + float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 }; + float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 }; + float *weights_buf = nullptr, *bias_buf = nullptr; +}; + +void NnPredictTest::runNnPredictTest(const NN_CONFIG *const shape) { + aom_clear_system_state(); + float inputs[NN_MAX_NODES_PER_LAYER] = { 0 }; + float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 }; + float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 }; + + NN_CONFIG nn_config; + memcpy(&nn_config, shape, sizeof(NN_CONFIG)); + + char shape_str[32] = { 0 }; + snprintf(shape_str, sizeof(shape_str), "%d", shape->num_inputs); + for (int layer = 0; layer < shape->num_hidden_layers; layer++) + snprintf(&shape_str[strlen(shape_str)], + sizeof(shape_str) - strlen(shape_str), "x%d", + shape->num_hidden_nodes[layer]); + snprintf(&shape_str[strlen(shape_str)], sizeof(shape_str) - strlen(shape_str), + "x%d", shape->num_outputs); + + for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) { + nn_config.weights[i] = weights[i]; + nn_config.bias[i] = bias[i]; + } + + for (int iter = 0; iter < 10000 && !HasFatalFailure(); ++iter) { + for (int node = 0; node < shape->num_inputs; node++) { + inputs[node] = (int32_t)rng_.Rand31() - (1 << 30); + inputs[node] /= (1 << 31); + } + for (int layer = 0; layer < shape->num_hidden_layers; layer++) { + for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) { + bias[layer][node] = (int32_t)rng_.Rand31() - (1 << 30); + bias[layer][node] /= (1 << 31); + } + for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER; + node++) { + weights[layer][node] = (int32_t)rng_.Rand31() - (1 << 30); + weights[layer][node] /= (1 << 31); + } + } + // Now the outputs: + int layer = shape->num_hidden_layers; + for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) { + bias[layer][node] = (int32_t)rng_.Rand31() - (1 << 30); + bias[layer][node] /= (1 << 31); + } + for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER; + node++) { + weights[layer][node] = (int32_t)rng_.Rand31() - (1 << 30); + weights[layer][node] /= (1 << 31); + } + + av1_nn_predict_c(inputs, &nn_config, outputs_ref); + target_func_(inputs, &nn_config, outputs_test); + aom_clear_system_state(); + + for (int node = 0; node < shape->num_outputs; node++) { + if (outputs_ref[node] < epsilon) { + ASSERT_LE(outputs_test[node], epsilon) + << "Reference output was near-zero, test output was not (" + << shape_str << ")"; + } else { + const float error = outputs_ref[node] - outputs_test[node]; + const float relative_error = fabs(error / outputs_ref[node]); + ASSERT_LE(relative_error, epsilon) + << "Excessive relative error between reference and test (" + << shape_str << ")"; + } + } + } + // printf("Network shape %s passed\n", shape_str); +} + +void NnPredictTest::runNnPredictSpeedTest(const NN_CONFIG *const shape, + const int run_times) { + aom_clear_system_state(); + float inputs[NN_MAX_NODES_PER_LAYER] = { 0 }; + float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 }; + float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 }; + + NN_CONFIG nn_config; + memcpy(&nn_config, shape, sizeof(NN_CONFIG)); + + for (int i = 0; i < NN_MAX_HIDDEN_LAYERS; i++) { + nn_config.weights[i] = weights[i]; + nn_config.bias[i] = bias[i]; + } + // Don't bother actually changing the values for inputs/weights/bias: it + // shouldn't make any difference for a speed test. + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + av1_nn_predict_c(inputs, &nn_config, outputs_ref); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + target_func_(inputs, &nn_config, outputs_test); + } + aom_usec_timer_mark(&timer); + aom_clear_system_state(); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + + printf("%d", shape->num_inputs); + for (int layer = 0; layer < shape->num_hidden_layers; layer++) + printf("x%d", shape->num_hidden_nodes[layer]); + printf("x%d: ", shape->num_outputs); + printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2); +} + +// This is all the neural network shapes observed executed in a few different +// runs of the encoder. It also conveniently covers all the kernels +// implemented. +static const NN_CONFIG shapes[] = { + { .num_inputs = 10, + .num_outputs = 16, + .num_hidden_layers = 1, + .num_hidden_nodes = { 64 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 12, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 12 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 12, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 24 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 12, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 32 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 18, + .num_outputs = 4, + .num_hidden_layers = 1, + .num_hidden_nodes = { 24 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 18, + .num_outputs = 4, + .num_hidden_layers = 1, + .num_hidden_nodes = { 32 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 4, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 16 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 8, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 16 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 8, + .num_outputs = 4, + .num_hidden_layers = 1, + .num_hidden_nodes = { 16 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 8, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 24 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 8, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 32 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 8, + .num_outputs = 1, + .num_hidden_layers = 1, + .num_hidden_nodes = { 64 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 9, + .num_outputs = 3, + .num_hidden_layers = 1, + .num_hidden_nodes = { 32 }, + .weights = { 0 }, + .bias = { 0 } }, + { .num_inputs = 4, + .num_outputs = 4, + .num_hidden_layers = 1, + .num_hidden_nodes = { 8 }, + .weights = { 0 }, + .bias = { 0 } }, +}; + +void NnPredictTest::runNnPredictTest_all(const NN_CONFIG *const shapes, + const int num_shapes) { + for (int i = 0; i < num_shapes; i++) runNnPredictTest(&shapes[i]); +} + +void NnPredictTest::runNnPredictSpeedTest_all(const NN_CONFIG *const shapes, + const int num_shapes, + const int run_times) { + for (int i = 0; i < num_shapes; i++) + NnPredictTest::runNnPredictSpeedTest(&shapes[i], run_times); +} + +TEST_P(NnPredictTest, RandomValues) { + runNnPredictTest_all(shapes, sizeof(shapes) / sizeof(NN_CONFIG)); +} + +TEST_P(NnPredictTest, DISABLED_Speed) { + runNnPredictSpeedTest_all(shapes, sizeof(shapes) / sizeof(NN_CONFIG), + 10000000); +} + +#if HAVE_SSE3 +INSTANTIATE_TEST_CASE_P(SSE3, NnPredictTest, + ::testing::Values(av1_nn_predict_sse3)); +#endif + +} // namespace
diff --git a/test/test.cmake b/test/test.cmake index d15e580..12f2319 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -170,6 +170,7 @@ "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc" "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc" "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc" + "${AOM_ROOT}/test/av1_nn_predict_test.cc" "${AOM_ROOT}/test/av1_round_shift_array_test.cc" "${AOM_ROOT}/test/av1_txfm_test.cc" "${AOM_ROOT}/test/av1_txfm_test.h"