Use dual functions for loop filter

Use dual SIMD functions for loop filtering, including the luma
13-tap, 7-tap, and 4-tap filters in the vertical and horizontal directions.

Chroma 5-tap does not have a dual SIMD function yet.

Change-Id: I3afdaab240613baffcd8c19d824bfb048ed64d8f
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 140a535..fa01356 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -379,7 +379,7 @@
 add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_vertical_14 sse2/;
 
-add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_vertical_14_dual sse2/;
 
 add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
@@ -398,7 +398,7 @@
 add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_horizontal_14 sse2/;
 
-add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_14_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
@@ -417,7 +417,7 @@
 add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
 specialize qw/aom_highbd_lpf_vertical_14 sse2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,   int bd";
+add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
 specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
 
 add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
@@ -438,7 +438,7 @@
 add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
 specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
 specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
 
 add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index 4901e73..5b98f699 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -383,9 +383,12 @@
   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
 }
 
-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                  const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
 }
 
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@@ -414,9 +417,12 @@
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
 }
 
-void aom_lpf_vertical_14_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1) {
+  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
 }
 
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
@@ -831,11 +837,12 @@
   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
 }
 
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int p,
-                                         const uint8_t *blimit,
-                                         const uint8_t *limit,
-                                         const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+void aom_highbd_lpf_horizontal_14_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+  highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
 }
 
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@@ -874,9 +881,11 @@
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
 }
 
-void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int p,
-                                       const uint8_t *blimit,
-                                       const uint8_t *limit,
-                                       const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+void aom_highbd_lpf_vertical_14_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                4, bd);
 }
diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c
index 7e91369f..7219eb7 100644
--- a/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -16,17 +16,20 @@
 #include "aom_dsp/x86/lpf_common_sse2.h"
 #include "aom/aom_integer.h"
 
-void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int p,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+                                         blimit1, limit1, thresh1, bd);
 }
 
-void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blt, const uint8_t *lt,
-                                          const uint8_t *thr, int bd) {
-  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+                                       limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_horizontal_4_dual_avx2(
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 8af993d..f7f99ad 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -224,7 +224,6 @@
 static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
                                             __m128i *qs, const __m128i *mask,
                                             const __m128i *th, int bd,
-
                                             __m128i *t80) {
   __m128i ps0 = _mm_subs_epi16(p[0], *t80);
   __m128i ps1 = _mm_subs_epi16(p[1], *t80);
@@ -471,10 +470,12 @@
 }
 
 static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
-    __m128i *p, __m128i *q, const uint8_t *blt, const uint8_t *lt,
-    const uint8_t *thr, int bd) {
+    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+    const uint8_t *thr1, int bd) {
   __m128i blimit, limit, thresh, t80;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+                 &t80);
   __m128i mask;
   highbd_filter_mask(p, q, &limit, &blimit, &mask);
   __m128i flat, flat2;
@@ -629,15 +630,16 @@
   }
 }
 
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
   __m128i p[7], q[7];
   int i;
   load_highbd_pixel(s, 7, pitch, p, q);
 
-  highbd_lpf_internal_14_dual_sse2(p, q, _blimit, _limit, _thresh, bd);
+  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+                                   _limit1, _thresh1, bd);
   for (i = 0; i < 6; i++) {
     _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
     _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
@@ -1461,10 +1463,10 @@
   _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
 }
 
-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
+void aom_highbd_lpf_vertical_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
   __m128i q[7], p[7];
   __m128i p6, p5, p4, p3, p2, p1, p0, q0;
   __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
@@ -1496,7 +1498,8 @@
                            &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
                            &q[6], &d7);
 
-  highbd_lpf_internal_14_dual_sse2(p, q, blimit, limit, thresh, bd);
+  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+                                   limit1, thresh1, bd);
 
   highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
                            &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 660ccd8..b9c3b2b 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -898,12 +898,16 @@
   xx_storel_32(s - 3 * p, p2);
   xx_storel_32(s + 2 * p, q2);
 }
+
 void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-  aom_lpf_horizontal_14_sse2(s, p, _blimit, _limit, _thresh);
-  aom_lpf_horizontal_14_sse2(s + 4, p, _blimit, _limit, _thresh);
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0,
+                                     const unsigned char *_blimit1,
+                                     const unsigned char *_limit1,
+                                     const unsigned char *_thresh1) {
+  aom_lpf_horizontal_14_sse2(s, p, _blimit0, _limit0, _thresh0);
+  aom_lpf_horizontal_14_sse2(s + 4, p, _blimit1, _limit1, _thresh1);
 }
 
 void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
@@ -1739,15 +1743,17 @@
   _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(d2d3_2, 8));
 }
 
-void aom_lpf_vertical_14_dual_sse2(unsigned char *s, int p,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh) {
+void aom_lpf_vertical_14_dual_sse2(
+    unsigned char *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
   // Transpose 16x16
   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
   // Loop filtering
-  aom_lpf_horizontal_14_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+  aom_lpf_horizontal_14_dual_sse2(t_dst + 8 * 16, 16, blimit0, limit0, thresh0,
+                                  blimit1, limit1, thresh1);
   // Transpose back
   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index fb07202..b516ef1 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -521,7 +521,6 @@
     MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
     const MB_MODE_INFO *const mbmi = mi[0];
     const int curr_skip = mbmi->skip && is_inter_block(mbmi);
-    // FIXME(chengchen): which bsize is correct?
     const BLOCK_SIZE bsize = mbmi->sb_type;
     const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
     const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
@@ -935,13 +934,18 @@
         LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          /*
-          aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
-          */
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
+          if (plane) {
+            // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
+            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                               lfi1->hev_thr);
+          } else {
+            // TODO(any): add a real dual SIMD implementation. The SSE2 version
+            // transposes and then calls aom_lpf_horizontal_14_sse2 twice.
+            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                     lfi1->hev_thr);
+          }
         } else if (mask_16x16_0 & 1) {
           lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
@@ -956,15 +960,15 @@
         LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          // TODO(chengchen): add aom_lpf_vertical_6_dual for chroma plane.
-          /*
-          aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-          */
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
+          if (plane) {
+            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                               lfi1->hev_thr);
+          } else {
+            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
+          }
         } else if (mask_8x8_0 & 1) {
           lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
@@ -975,14 +979,9 @@
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          /*
           aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
-          */
-          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
           aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
@@ -1026,15 +1025,16 @@
             plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          // TODO(chengchen): use dual function
-          /*
-          aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
-          */
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr, bd);
+            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+                                      lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                            lfi0->hev_thr, lfi1->mblim,
+                                            lfi1->lim, lfi1->hev_thr, bd);
+          }
         } else if (mask_16x16_0 & 1) {
           highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                               bd);
@@ -1049,15 +1049,16 @@
             plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          /*
-          aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-          */
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr, bd);
+            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+                                      lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
+          }
         } else if (mask_8x8_0 & 1) {
           highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                               bd);
@@ -1069,15 +1070,9 @@
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          /*
           aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
-          */
-          aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
-          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
           aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, bd);
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 9a4c2e0..198bd1f 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -452,8 +452,6 @@
              8),
   make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
              &aom_highbd_lpf_horizontal_14_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
 
@@ -468,8 +466,6 @@
              10),
   make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
              &aom_highbd_lpf_horizontal_14_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
@@ -483,16 +479,8 @@
              12),
   make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
              &aom_highbd_lpf_horizontal_14_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
              12),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
 };
@@ -506,17 +494,23 @@
   make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
   make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
   make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
-  make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
-             8),
   make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
   make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
   make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
-  make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
                         ::testing::ValuesIn(kLoop8Test6));
 
+const dual_loop_param_t kLoop8Test9[] = {
+  make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
+             8),
+  make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd,
+                        ::testing::ValuesIn(kLoop8Test9));
+
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE2
@@ -525,26 +519,38 @@
              &aom_highbd_lpf_horizontal_4_dual_c, 8),
   make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
              &aom_highbd_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
              &aom_highbd_lpf_vertical_4_dual_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
              &aom_highbd_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 8),
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
              &aom_highbd_lpf_horizontal_4_dual_c, 10),
   make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
              &aom_highbd_lpf_horizontal_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
              &aom_highbd_lpf_vertical_4_dual_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
              &aom_highbd_lpf_vertical_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 10),
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
              &aom_highbd_lpf_horizontal_4_dual_c, 12),
   make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
              &aom_highbd_lpf_horizontal_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
              &aom_highbd_lpf_vertical_4_dual_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 12)
+             &aom_highbd_lpf_vertical_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 12),
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,