Use dual functions for loop filter
Use dual SIMD functions for loop filtering, including the
luma 13-tap, 7-tap, and 4-tap filters for the vertical and horizontal directions.
Chroma 5-tap does not have a dual SIMD function yet.
Change-Id: I3afdaab240613baffcd8c19d824bfb048ed64d8f
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 140a535..fa01356 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -379,7 +379,7 @@
add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_14 sse2/;
-add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_vertical_14_dual sse2/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
@@ -398,7 +398,7 @@
add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_horizontal_14 sse2/;
-add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_14_dual sse2/;
add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
@@ -417,7 +417,7 @@
add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_14 sse2/;
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
@@ -438,7 +438,7 @@
add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index 4901e73..5b98f699 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -383,9 +383,12 @@
mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
}
-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
}
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@@ -414,9 +417,12 @@
mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
}
-void aom_lpf_vertical_14_dual_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
}
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
@@ -831,11 +837,12 @@
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
}
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+void aom_highbd_lpf_horizontal_14_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+ highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
}
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@@ -874,9 +881,11 @@
highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
}
-void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+void aom_highbd_lpf_vertical_14_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+ highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ 4, bd);
}
diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c
index 7e91369f..7219eb7 100644
--- a/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -16,17 +16,20 @@
#include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom/aom_integer.h"
-void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int p,
- const uint8_t *blt,
- const uint8_t *lt,
- const uint8_t *thr, int bd) {
- aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
}
-void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int p,
- const uint8_t *blt, const uint8_t *lt,
- const uint8_t *thr, int bd) {
- aom_highbd_lpf_vertical_14_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
}
void aom_highbd_lpf_horizontal_4_dual_avx2(
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 8af993d..f7f99ad 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -224,7 +224,6 @@
static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
__m128i *qs, const __m128i *mask,
const __m128i *th, int bd,
-
__m128i *t80) {
__m128i ps0 = _mm_subs_epi16(p[0], *t80);
__m128i ps1 = _mm_subs_epi16(p[1], *t80);
@@ -471,10 +470,12 @@
}
static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
- __m128i *p, __m128i *q, const uint8_t *blt, const uint8_t *lt,
- const uint8_t *thr, int bd) {
+ __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+ const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+ const uint8_t *thr1, int bd) {
__m128i blimit, limit, thresh, t80;
- get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+ get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+ &t80);
__m128i mask;
highbd_filter_mask(p, q, &limit, &blimit, &mask);
__m128i flat, flat2;
@@ -629,15 +630,16 @@
}
}
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
__m128i p[7], q[7];
int i;
load_highbd_pixel(s, 7, pitch, p, q);
- highbd_lpf_internal_14_dual_sse2(p, q, _blimit, _limit, _thresh, bd);
+ highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
for (i = 0; i < 6; i++) {
_mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
_mm_store_si128((__m128i *)(s + i * pitch), q[i]);
@@ -1461,10 +1463,10 @@
_mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
}
-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
+void aom_highbd_lpf_vertical_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
__m128i q[7], p[7];
__m128i p6, p5, p4, p3, p2, p1, p0, q0;
__m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
@@ -1496,7 +1498,8 @@
&q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
&q[6], &d7);
- highbd_lpf_internal_14_dual_sse2(p, q, blimit, limit, thresh, bd);
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
&d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 660ccd8..b9c3b2b 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -898,12 +898,16 @@
xx_storel_32(s - 3 * p, p2);
xx_storel_32(s + 2 * p, q2);
}
+
void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- aom_lpf_horizontal_14_sse2(s, p, _blimit, _limit, _thresh);
- aom_lpf_horizontal_14_sse2(s + 4, p, _blimit, _limit, _thresh);
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ aom_lpf_horizontal_14_sse2(s, p, _blimit0, _limit0, _thresh0);
+ aom_lpf_horizontal_14_sse2(s + 4, p, _blimit1, _limit1, _thresh1);
}
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
@@ -1739,15 +1743,17 @@
_mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(d2d3_2, 8));
}
-void aom_lpf_vertical_14_dual_sse2(unsigned char *s, int p,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
+void aom_lpf_vertical_14_dual_sse2(
+ unsigned char *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
// Transpose 16x16
transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
// Loop filtering
- aom_lpf_horizontal_14_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+ aom_lpf_horizontal_14_dual_sse2(t_dst + 8 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
// Transpose back
transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index fb07202..b516ef1 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -521,7 +521,6 @@
MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
const MB_MODE_INFO *const mbmi = mi[0];
const int curr_skip = mbmi->skip && is_inter_block(mbmi);
- // FIXME(chengchen): which bsize is correct?
const BLOCK_SIZE bsize = mbmi->sb_type;
const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
@@ -935,13 +934,18 @@
LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
if ((mask_16x16_0 & mask_16x16_1) & 1) {
- /*
- aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr);
- */
- lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ if (plane) {
+ // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
+ aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ // TODO(any): add a true dual SIMD function. The current sse2 code
+ // transposes and then calls aom_lpf_horizontal_14_sse2 twice.
+ aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
} else if (mask_16x16_0 & 1) {
lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
} else {
@@ -956,15 +960,15 @@
LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
if ((mask_8x8_0 & mask_8x8_1) & 1) {
- // TODO(chengchen): add aom_lpf_vertical_6_dual for chroma plane.
- /*
- aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- */
- lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ if (plane) {
+ aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
} else if (mask_8x8_0 & 1) {
lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
} else {
@@ -975,14 +979,9 @@
if ((mask_4x4_0 | mask_4x4_1) & 1) {
if ((mask_4x4_0 & mask_4x4_1) & 1) {
- /*
aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
lfi1->hev_thr);
- */
- aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
} else if (mask_4x4_0 & 1) {
aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
} else {
@@ -1026,15 +1025,16 @@
plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
if ((mask_16x16_0 & mask_16x16_1) & 1) {
- // TODO(chengchen): use dual function
- /*
- aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- */
- highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
- bd);
- highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
} else if (mask_16x16_0 & 1) {
highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
bd);
@@ -1049,15 +1049,16 @@
plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
if ((mask_8x8_0 & mask_8x8_1) & 1) {
- /*
- aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
- */
- highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
- bd);
- highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
} else if (mask_8x8_0 & 1) {
highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
bd);
@@ -1069,15 +1070,9 @@
if ((mask_4x4_0 | mask_4x4_1) & 1) {
if ((mask_4x4_0 & mask_4x4_1) & 1) {
- /*
aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
lfi1->hev_thr, bd);
- */
- aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
} else if (mask_4x4_0 & 1) {
aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, bd);
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 9a4c2e0..198bd1f 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -452,8 +452,6 @@
8),
make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
&aom_highbd_lpf_horizontal_14_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
@@ -468,8 +466,6 @@
10),
make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
&aom_highbd_lpf_horizontal_14_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
@@ -483,16 +479,8 @@
12),
make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
&aom_highbd_lpf_horizontal_14_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
12),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
};
@@ -506,17 +494,23 @@
make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
- make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
- 8),
make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
- make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
};
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
::testing::ValuesIn(kLoop8Test6));
+const dual_loop_param_t kLoop8Test9[] = {
+ make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd,
+ ::testing::ValuesIn(kLoop8Test9));
+
#endif // HAVE_SSE2
#if HAVE_SSE2
@@ -525,26 +519,38 @@
&aom_highbd_lpf_horizontal_4_dual_c, 8),
make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
&aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
&aom_highbd_lpf_vertical_4_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
&aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 8),
make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
&aom_highbd_lpf_horizontal_4_dual_c, 10),
make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
&aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
&aom_highbd_lpf_vertical_4_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
&aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 10),
make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
&aom_highbd_lpf_horizontal_4_dual_c, 12),
make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
&aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
&aom_highbd_lpf_vertical_4_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
- &aom_highbd_lpf_vertical_8_dual_c, 12)
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,