Add comments to AVX2 variant of av1_cnn_convolve module

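Added comments in av1_cnn_convolve_no_maxpool_padding_valid_avx2() function
describing how the out_accum registers are used for layer 1 and layer 2.
Also simplified the loops that store the layer outputs so that they iterate
directly over the out_accum registers.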

BUG=aomedia:2867

Change-Id: Ic6e860a2e88f136a0bf7b67860e515c1673daf54
diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c
index 1974d33..0c4b305 100644
--- a/av1/encoder/x86/cnn_avx2.c
+++ b/av1/encoder/x86/cnn_avx2.c
@@ -376,6 +376,14 @@
   const int kSkipHeight = 2;
   for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
     __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // out_accum registers are used to store the 2x2 convolve outputs
+    // (calculated over input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of the for loop processes
+    // 8 (horizontal) 2x2 blocks and stores them in the corresponding out_accum
+    // register (as the input size is 16x16, a total of 64 2x2 blocks are
+    // present and 8 out_accum registers are enough to store the outputs).
+    // Hence the for loops corresponding to 'j' and 'h', below, run over the
+    // number of out_accum registers.
     __m256 out_accum[8];
     for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
     for (int k = 0; k < layer_config->in_channels; ++k) {
@@ -395,9 +403,8 @@
       }
     }
     // Store output of layer 1.
-    for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
-         h += kSkipHeight, ++u) {
-      _mm256_storeu_ps(&output[i][u * out_stride], out_accum[u]);
+    for (int j = 0; j < 8; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
     }
   }
 }
@@ -416,6 +423,14 @@
   const int kSkipHeight = 2;
   for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
     __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // out_accum registers are used to store the 2x2 convolve outputs
+    // (calculated over input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of the for loop processes
+    // 8 (4 horizontal x 2 vertical) 2x2 blocks and stores them in the
+    // corresponding out_accum register (as the input size is 8x8, a total of 16
+    // 2x2 blocks are present and 2 out_accum registers are enough to store the
+    // outputs). Hence the for loops corresponding to 'j' and 'h', below, run
+    // over the number of out_accum registers.
     __m256 out_accum[2];
 
     // Height needs to be moved to go to next iteration of processing
@@ -440,9 +455,8 @@
       }
     }
     // Store output of layer 2.
-    for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
-         h += kSkipHeightForNextIter, ++u) {
-      _mm256_storeu_ps(&output[i][u * out_stride * 2], out_accum[u]);
+    for (int j = 0; j < 2; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
     }
   }
 }