Add comments to AVX2 variant of av1_cnn_convolve module
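Add comments in av1_cnn_convolve_no_maxpool_padding_valid_avx2() function.
Also rewrite the layer 1 and layer 2 output store loops to iterate
directly over the out_accum registers.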
BUG=aomedia:2867
Change-Id: Ic6e860a2e88f136a0bf7b67860e515c1673daf54
diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c
index 1974d33..0c4b305 100644
--- a/av1/encoder/x86/cnn_avx2.c
+++ b/av1/encoder/x86/cnn_avx2.c
@@ -376,6 +376,14 @@
const int kSkipHeight = 2;
for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
__m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+ // The out_accum registers are used to store the 2x2 convolve outputs
+ // (calculated over the input block size), which are accumulated across the
+ // in_channels. As per the design, each iteration of the for loop processes
+ // 8 (horizontal) 2x2 blocks and stores them in the corresponding out_accum
+ // register (as the input size is 16x16, a total of 64 2x2 blocks are
+ // present and 8 out_accum registers are enough to store the outputs).
+ // Hence the for loops corresponding to 'j' and 'h', below, run over the
+ // number of out_accum registers.
__m256 out_accum[8];
for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
for (int k = 0; k < layer_config->in_channels; ++k) {
@@ -395,9 +403,8 @@
}
}
// Store output of layer 1.
- for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
- h += kSkipHeight, ++u) {
- _mm256_storeu_ps(&output[i][u * out_stride], out_accum[u]);
+ for (int j = 0; j < 8; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
}
}
}
@@ -416,6 +423,14 @@
const int kSkipHeight = 2;
for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
__m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+ // The out_accum registers are used to store the 2x2 convolve outputs
+ // (calculated over the input block size), which are accumulated across the
+ // in_channels. As per the design, each iteration of the for loop processes
+ // 8 (4 horizontal x 2 vertical) 2x2 blocks and stores them in the
+ // corresponding out_accum register (as the input size is 8x8, a total of
+ // 16 2x2 blocks are present and 2 out_accum registers are enough to store
+ // the outputs). Hence the for loops corresponding to 'j' and 'h', below,
+ // run over the number of out_accum registers.
__m256 out_accum[2];
// Height needs to be moved to go to next iteration of processing
@@ -440,9 +455,8 @@
}
}
// Store output of layer 2.
- for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
- h += kSkipHeightForNextIter, ++u) {
- _mm256_storeu_ps(&output[i][u * out_stride * 2], out_accum[u]);
+ for (int j = 0; j < 2; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
}
}
}