Lowbd parallel_deblocking sse2 optimization

Baseline + parallel_deblocking:

- Passed unit tests *SSE2/Loop8Test6*, *AVX2/Loop8Test6*.
- 1080p, 25 frames, profile=0: encoding/decoding output matches.
- Decoder frame rate increases from 54.15 to 65.84 fps.

Change-Id: I55938c94961066594f4b9080192c7268c19d9bf9
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4828345..b7f9b6b 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -65,12 +65,17 @@
 
 set(AOM_DSP_COMMON_INTRIN_AVX2
     "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_AVX2
+      ${AOM_DSP_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
+endif ()
+
 set(AOM_DSP_COMMON_ASM_NEON
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
@@ -85,12 +90,17 @@
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_ASM_NEON
+      ${AOM_DSP_COMMON_ASM_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
+endif ()
+
 set(AOM_DSP_COMMON_INTRIN_NEON
     "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
@@ -98,13 +108,18 @@
     "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_NEON
+      ${AOM_DSP_COMMON_INTRIN_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c")
+endif ()
+
 if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
   set(AOM_DSP_COMMON_INTRIN_NEON
       ${AOM_DSP_COMMON_INTRIN_NEON}
@@ -120,10 +135,15 @@
       "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
       "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
       "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c")
+
+  if (NOT CONFIG_PARALLEL_DEBLOCKING)
+    set(AOM_DSP_COMMON_INTRIN_NEON
+        ${AOM_DSP_COMMON_INTRIN_NEON}
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+  endif ()
 endif ()
 
 set(AOM_DSP_COMMON_INTRIN_DSPR2
@@ -143,14 +163,19 @@
     "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_DSPR2
+      ${AOM_DSP_COMMON_INTRIN_DSPR2}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+endif ()
 
 set(AOM_DSP_COMMON_INTRIN_MSA
     "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
@@ -171,13 +196,18 @@
     "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
     "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
     "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h"
     "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
     "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_MSA
+      ${AOM_DSP_COMMON_INTRIN_MSA}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h")
+endif ()
+
 if (CONFIG_HIGHBITDEPTH)
   set(AOM_DSP_COMMON_ASM_SSE2
       ${AOM_DSP_COMMON_ASM_SSE2}
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index bf856ac..3eb0e3b 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -164,9 +164,11 @@
 DSP_SRCS-yes += loopfilter.c
 
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
-DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
 DSP_SRCS-$(HAVE_SSE2)                += x86/lpf_common_sse2.h
 
+ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes)
+DSP_SRCS-$(HAVE_AVX2)   += x86/loopfilter_avx2.c
+
 DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
@@ -192,6 +194,7 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
+endif  # !CONFIG_PARALLEL_DEBLOCKING
 
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index f30879d..8400584 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -230,46 +230,84 @@
 # Loopfilter
 #
 add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-$aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_vertical_16 sse2/;
+} else {
+  specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
+  $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon;
+}
 
 add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-$aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
+  specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
+  $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon;
+}
 
 add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_vertical_8 sse2/;
+} else {
+  specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/;
+}
 
 add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
+  specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
+  $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon;
+}
 
 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_vertical_4 sse2/;
+} else {
+  specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/;
+}
 
 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
+  specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+}
 
 add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
-$aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_horizontal_edge_8 sse2/;
+} else {
+  specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
+  $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon;
+}
 
 add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
-$aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_horizontal_edge_16 sse2/;
+} else {
+  specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
+  $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon;
+}
 
 add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_horizontal_8 sse2/;
+} else {
+  specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/;
+}
 
 add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
+  specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
+  $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon;
+}
 
 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
+  specialize qw/aom_lpf_horizontal_4 sse2/;
+} else {
+  specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
+}
 
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
+  specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+}
 
 if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 7e134dc..8343dbb 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -178,10 +178,20 @@
 #endif  // !CONFIG_PARALLEL_DEBLOCKING
   FILTER4;
 
+#if CONFIG_PARALLEL_DEBLOCKING
+  *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);  // *op0
+  ps1ps0 = _mm_srli_si128(ps1ps0, 8);  // high 8 bytes -> low
+  *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);  // *op1
+
+  *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);  // *oq0
+  qs1qs0 = _mm_srli_si128(qs1qs0, 8);  // high 8 bytes -> low
+  *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);  // *oq1
+#else
   _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
   _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
   _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
   _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 }
 
 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
@@ -267,8 +277,10 @@
   x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
   // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+#if !CONFIG_PARALLEL_DEBLOCKING
   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
   qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
@@ -279,7 +291,7 @@
   *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
   *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-
+#if !CONFIG_PARALLEL_DEBLOCKING
   *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
   *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
@@ -287,6 +299,19 @@
   *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
   *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+}
+
+static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
+                                       uint8_t *s) {
+#if CONFIG_PARALLEL_DEBLOCKING
+  *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);  // low 4 bytes
+  const __m128i hi = _mm_srli_si128(*x, 8);  // high 8 bytes -> low
+  *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);  // bytes 8..11 of *x
+#else
+  _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);  // low 8 bytes
+  _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));  // high 8 bytes
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 }
 
 void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
@@ -580,44 +605,37 @@
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    store_buffer_horz_8(&q6p6, p, 6, s);
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    store_buffer_horz_8(&q5p5, p, 5, s);
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    store_buffer_horz_8(&q4p4, p, 4, s);
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    store_buffer_horz_8(&q3p3, p, 3, s);
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    store_buffer_horz_8(&q2p2, p, 2, s);
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    store_buffer_horz_8(&q1p1, p, 1, s);
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    store_buffer_horz_8(&q0p0, p, 0, s);
   }
 }
 
@@ -651,10 +669,33 @@
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
+
+static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
+                                        int p, int offset, uint8_t *s) {
+  int i;  // x[i] is written to row s - (i - offset) * p
+  if (pixel_num == FOUR_PIXELS) {  // low 4 bytes of each x[i]
+    for (i = 13; i >= 0; i--) {  // caller passes a 14-entry array
+      *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
+    }
+  }
+  if (pixel_num == EIGHT_PIXELS) {  // low 8 bytes of each x[i]
+    for (i = 13; i >= 0; i--) {
+      _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
+    }
+  }
+  if (pixel_num == SIXTEEN_PIXELS) {  // all 16 bytes, unaligned store
+    for (i = 13; i >= 0; i--) {
+      _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
+    }
+  }
+}
+
+static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
+                                             unsigned char *s, int p,
+                                             const unsigned char *_blimit,
+                                             const unsigned char *_limit,
+                                             const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -910,73 +951,62 @@
       f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
 
-      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+      __m128i x[14];
+      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
-      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
-      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
-      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
-      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
-      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
-      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
-      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
-      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
-      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
-      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+      x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
-      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+      x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
-      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
-      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+      x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+
+      store_buffer_horz_16(pixel_num, x, p, 6, s);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1186,15 +1216,35 @@
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
+#if CONFIG_PARALLEL_DEBLOCKING
+    *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);  // low 4 bytes only
+    *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
+    *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
+    *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
+    *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
+    *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
+#else
     _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
     _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
     _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
     _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
     _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
     _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+#endif  // CONFIG_PARALLEL_DEBLOCKING
   }
 }
 
+void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit,
+                                     const unsigned char *_limit,
+                                     const unsigned char *_thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING
+  lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
+#else
+  lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+}
+
 void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                     const uint8_t *_limit0,
                                     const uint8_t *_thresh0,
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index 62e6b81..a8ec50c 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -13,13 +13,12 @@
 
 #include "./aom_config.h"
 #include "./aom_dsp_rtcd.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 
 #if CONFIG_LPF_DIRECT
@@ -3002,8 +3001,8 @@
                                         params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_4_c(filt_start, line_length, params.mblim,
-                                   params.lim, params.hev_thr);
+              aom_lpf_vertical_4(filt_start, line_length, params.mblim,
+                                 params.lim, params.hev_thr);
             break;
           // apply 8-tap filtering
           case 8:
@@ -3014,8 +3013,8 @@
                                         params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_8_c(filt_start, line_length, params.mblim,
-                                   params.lim, params.hev_thr);
+              aom_lpf_vertical_8(filt_start, line_length, params.mblim,
+                                 params.lim, params.hev_thr);
             break;
           // apply 16-tap filtering
           case 16:
@@ -3026,8 +3025,8 @@
                                          params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_16_c(filt_start, line_length, params.mblim,
-                                    params.lim, params.hev_thr);
+              aom_lpf_vertical_16(filt_start, line_length, params.mblim,
+                                  params.lim, params.hev_thr);
             break;
           // no filtering
           default: break;
@@ -3060,8 +3059,8 @@
                                     params.hev_thr, cm->bit_depth);
         else
 #endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_vertical_4_c(filt_start, line_length, params.mblim,
-                               params.lim, params.hev_thr);
+          aom_lpf_vertical_4(filt_start, line_length, params.mblim, params.lim,
+                             params.hev_thr);
 
         for (i = 0; i < 128; ++i) {
           if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
@@ -3078,8 +3077,8 @@
                                       cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_vertical_4_c(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+            aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
@@ -3090,8 +3089,8 @@
                                       cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_vertical_8_c(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+            aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
           break;
         // apply 16-tap filtering
         case 16:
@@ -3102,8 +3101,8 @@
                                        cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim,
-                                  params.hev_thr);
+            aom_lpf_vertical_16(p, dst_stride, params.mblim, params.lim,
+                                params.hev_thr);
           break;
         // no filtering
         default: break;
@@ -3117,8 +3116,8 @@
                                     cm->bit_depth);
         else
 #endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_vertical_4_c(p + 4, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_lpf_vertical_4(p + 4, dst_stride, params.mblim, params.lim,
+                             params.hev_thr);
       }
 #endif  // CONFIG_LPF_DIRECT
       // advance the destination pointer
@@ -3188,8 +3187,8 @@
                                           params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_4_c(filt_start, line_length, params.mblim,
-                                     params.lim, params.hev_thr);
+              aom_lpf_horizontal_4(filt_start, line_length, params.mblim,
+                                   params.lim, params.hev_thr);
             break;
           // apply 8-tap filtering
           case 8:
@@ -3200,8 +3199,8 @@
                                           params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_8_c(filt_start, line_length, params.mblim,
-                                     params.lim, params.hev_thr);
+              aom_lpf_horizontal_8(filt_start, line_length, params.mblim,
+                                   params.lim, params.hev_thr);
             break;
           // apply 16-tap filtering
           case 16:
@@ -3212,9 +3211,8 @@
                   params.lim, params.hev_thr, cm->bit_depth);
             else
 #endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_edge_16_c(filt_start, line_length,
-                                           params.mblim, params.lim,
-                                           params.hev_thr);
+              aom_lpf_horizontal_edge_16(filt_start, line_length, params.mblim,
+                                         params.lim, params.hev_thr);
             break;
           // no filtering
           default: break;
@@ -3246,8 +3244,8 @@
                                       params.hev_thr, cm->bit_depth);
         else
 #endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_horizontal_4_c(filt_start, line_length, params.mblim,
-                                 params.lim, params.hev_thr);
+          aom_lpf_horizontal_4(filt_start, line_length, params.mblim,
+                               params.lim, params.hev_thr);
 
         for (i = 0; i < 256; ++i) {
           if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
@@ -3264,8 +3262,8 @@
                                         params.hev_thr, cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_horizontal_4_c(p, dst_stride, params.mblim, params.lim,
-                                   params.hev_thr);
+            aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
+                                 params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
@@ -3276,8 +3274,8 @@
                                         params.hev_thr, cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_horizontal_8_c(p, dst_stride, params.mblim, params.lim,
-                                   params.hev_thr);
+            aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
+                                 params.hev_thr);
           break;
         // apply 16-tap filtering
         case 16:
@@ -3288,8 +3286,8 @@
                 params.hev_thr, cm->bit_depth);
           else
 #endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_horizontal_edge_16_c(p, dst_stride, params.mblim,
-                                         params.lim, params.hev_thr);
+            aom_lpf_horizontal_edge_16(p, dst_stride, params.mblim, params.lim,
+                                       params.hev_thr);
           break;
         // no filtering
         default: break;
@@ -3303,8 +3301,8 @@
                                       params.hev_thr, cm->bit_depth);
         else
 #endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_horizontal_4_c(p + 4 * dst_stride, dst_stride, params.mblim,
-                                 params.lim, params.hev_thr);
+          aom_lpf_horizontal_4(p + 4 * dst_stride, dst_stride, params.mblim,
+                               params.lim, params.hev_thr);
       }
 #endif  // CONFIG_LPF_DIRECT
       // advance the destination pointer
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 5cbd92e..4859a8e 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -550,7 +550,6 @@
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
                         ::testing::ValuesIn(kHbdLoop8Test6));
 #else
-#if !CONFIG_PARALLEL_DEBLOCKING
 const loop8_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
   make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
@@ -560,12 +559,13 @@
   make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
   make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
   make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
+#if !CONFIG_PARALLEL_DEBLOCKING
   make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
                         ::testing::ValuesIn(kLoop8Test6));
-#endif
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // HAVE_SSE2