SSE2 optimization for lpf 16_dual implementations

Covers the horizontal and vertical variants, for both
low and high bitdepth types.

The corresponding tests for the new functions are enabled.

Performance changes, SSE2 over C:
Horizontal methods: up to  3x
Vertical   methods: up to  2x

Change-Id: If430a916394c7befa743e4fbaa9913fd37c535ed
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index f3d0aa1..afde34f 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -450,6 +450,8 @@
              8),
   make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
              &aom_highbd_lpf_horizontal_16_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+             &aom_highbd_lpf_horizontal_16_dual_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
   make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
              8),
@@ -460,6 +462,8 @@
              10),
   make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
              &aom_highbd_lpf_horizontal_16_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+             &aom_highbd_lpf_horizontal_16_dual_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
   make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
              10),
@@ -470,6 +474,16 @@
              12),
   make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
              &aom_highbd_lpf_horizontal_16_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+             &aom_highbd_lpf_horizontal_16_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
+             12),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+             &aom_highbd_lpf_vertical_16_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+             &aom_highbd_lpf_vertical_16_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+             &aom_highbd_lpf_vertical_16_dual_c, 12),
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
 };
 
@@ -482,9 +496,12 @@
   make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
   make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
   make_tuple(&aom_lpf_horizontal_16_sse2, &aom_lpf_horizontal_16_c, 8),
+  make_tuple(&aom_lpf_horizontal_16_dual_sse2, &aom_lpf_horizontal_16_dual_c,
+             8),
   make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
   make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
   make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
+  make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,