Low bit-depth parallel_deblocking SSE2 optimization

Results for the baseline configuration with parallel_deblocking enabled:

- Passed unit tests *SSE2/Loop8Test6*, *AVX2/Loop8Test6*.
- Encoded and decoded 25 frames of 1080p content with profile=0; the outputs match.
- Decoder frame rate increases from 54.15 fps to 65.84 fps (about 21.6%).

Change-Id: I55938c94961066594f4b9080192c7268c19d9bf9
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4828345..b7f9b6b 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -65,12 +65,17 @@
 
 set(AOM_DSP_COMMON_INTRIN_AVX2
     "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_AVX2
+      ${AOM_DSP_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
+endif ()
+
 set(AOM_DSP_COMMON_ASM_NEON
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
@@ -85,12 +90,17 @@
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_ASM_NEON
+      ${AOM_DSP_COMMON_ASM_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
+endif ()
+
 set(AOM_DSP_COMMON_INTRIN_NEON
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
@@ -98,13 +108,18 @@
     "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_NEON
+      ${AOM_DSP_COMMON_INTRIN_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c")
+endif ()
+
 if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
   set(AOM_DSP_COMMON_INTRIN_NEON
       ${AOM_DSP_COMMON_INTRIN_NEON}
@@ -120,10 +135,15 @@
       "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
       "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
       "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c")
+
+  if (NOT CONFIG_PARALLEL_DEBLOCKING)
+    set(AOM_DSP_COMMON_INTRIN_NEON
+        ${AOM_DSP_COMMON_INTRIN_NEON}
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+  endif ()
 endif ()
 
 set(AOM_DSP_COMMON_INTRIN_DSPR2
@@ -143,14 +163,19 @@
     "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_DSPR2
+      ${AOM_DSP_COMMON_INTRIN_DSPR2}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+endif ()
 
 set(AOM_DSP_COMMON_INTRIN_MSA
     "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
@@ -171,13 +196,18 @@
     "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
     "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h"
     "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
     "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_MSA
+      ${AOM_DSP_COMMON_INTRIN_MSA}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h")
+endif ()
+
 if (CONFIG_HIGHBITDEPTH)
   set(AOM_DSP_COMMON_ASM_SSE2
       ${AOM_DSP_COMMON_ASM_SSE2}