[CFL] SSSE3/AVX2/NEON 4:2:2 Subsampling

SSSE3/Subsample422SpeedTest
4x4: C time = 104 us, SIMD time = 48 us (~2.2x)
8x8: C time = 351 us, SIMD time = 80 us (~4.4x)
16x16: C time = 1454 us, SIMD time = 163 us (~8.9x)
32x32: C time = 5345 us, SIMD time = 680 us (~7.9x)

AVX2/Subsample422SpeedTest
32x32: C time = 5382 us, SIMD time = 782 us (~6.9x)

NEON/Subsample422SpeedTest
4x4: C time = 1131 us, SIMD time = 768 us (~1.5x)
8x8: C time = 4214 us, SIMD time = 1451 us (~2.9x)
16x16: C time = 16898 us, SIMD time = 3529 us (~4.8x)
32x32: C time = 66097 us, SIMD time = 13776 us (~4.8x)

Change-Id: Iaa0153222bd214d48d7661b20ac06d1ddbd48997
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index beaaa13..8bc8489 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -63,6 +63,31 @@
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
 
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              int16_t *pred_buf_q3, int width,
+                                              int height) {
+  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+    } else if (width == 8) {
+      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+    } else {
+      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(top), 2));
+      if (width == 32) {
+        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
+        vst1q_s16(pred_buf_q3 + 8,
+                  vshlq_n_s16(vreinterpretq_s16_u16(next_top), 2));
+      }
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
 CFL_GET_SUBSAMPLE_FUNCTION(neon)
 
 static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index cc2e016..9b3a83b 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -353,6 +353,9 @@
 add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
 
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
 add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
 specialize qw/get_predict_lbd_fn ssse3 avx2/;
 
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index ac16786..41433b0 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -335,10 +335,6 @@
   }
 }
 
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 422 SIMD
-// will be implemented
-CFL_SUBSAMPLE_FUNCTIONS(c, 422, lbd)
-
 // TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
 // will be implemented
 CFL_SUBSAMPLE_FUNCTIONS(c, 444, lbd)
@@ -357,41 +353,6 @@
 
 CFL_GET_SUBSAMPLE_FUNCTION(c)
 
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size) {
-  CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 420, hbd)
-  return subfn_420[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size) {
-  CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 422, hbd)
-  return subfn_422[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size) {
-  CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 444, hbd)
-  return subfn_444[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 422 SIMD
-// will be implemented
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size) {
-  CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 422, lbd)
-  return subfn_422[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size) {
-  CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 444, lbd)
-  return subfn_444[tx_size];
-}
-
 static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
                                                        int sub_x, int sub_y) {
   if (sub_x == 1) {
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index cb76b8d..2850c02 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -78,21 +78,26 @@
   }
 
 // Declare size-specific wrappers for all valid CfL sizes.
-#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
-  CFL_SUBSAMPLE(arch, sub, bd, 4, 4)           \
-  CFL_SUBSAMPLE(arch, sub, bd, 8, 8)           \
-  CFL_SUBSAMPLE(arch, sub, bd, 16, 16)         \
-  CFL_SUBSAMPLE(arch, sub, bd, 32, 32)         \
-  CFL_SUBSAMPLE(arch, sub, bd, 4, 8)           \
-  CFL_SUBSAMPLE(arch, sub, bd, 8, 4)           \
-  CFL_SUBSAMPLE(arch, sub, bd, 8, 16)          \
-  CFL_SUBSAMPLE(arch, sub, bd, 16, 8)          \
-  CFL_SUBSAMPLE(arch, sub, bd, 16, 32)         \
-  CFL_SUBSAMPLE(arch, sub, bd, 32, 16)         \
-  CFL_SUBSAMPLE(arch, sub, bd, 4, 16)          \
-  CFL_SUBSAMPLE(arch, sub, bd, 16, 4)          \
-  CFL_SUBSAMPLE(arch, sub, bd, 8, 32)          \
-  CFL_SUBSAMPLE(arch, sub, bd, 32, 8)
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd)                            \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 8)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 4)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 32)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 8)                                     \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+      TX_SIZE tx_size) {                                                  \
+    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+    return subfn_##sub[tx_size];                                          \
+  }
 
 // Declare an architecture-specific array of function pointers for size-specific
 // wrappers.
@@ -121,13 +126,9 @@
 
 // The RTCD script does not support passing in the an array, so we wrap it in
 // this function.
-#define CFL_GET_SUBSAMPLE_FUNCTION(arch)                        \
-  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd)                       \
-  cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_##arch( \
-      TX_SIZE tx_size) {                                        \
-    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, 420, lbd)                \
-    return subfn_420[tx_size];                                  \
-  }
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd)
 
 // Null function used for invalid tx_sizes
 static INLINE void cfl_subtract_average_null(int16_t *pred_buf_q3) {
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index 5d3a141..ae096e7 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -16,6 +16,36 @@
 
 #include "av1/common/x86/cfl_simd.h"
 
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                           \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                     \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                     \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                      \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(    \
+      TX_SIZE tx_size) {                                                   \
+    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {     \
+      subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
+      subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
+      subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
+      subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
+      cfl_subsample_##bd##_null,            /* 64x64 (invalid CFL size) */ \
+      subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
+      subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
+      subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
+      subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
+      subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
+      subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
+      cfl_subsample_##bd##_null,            /* 32x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_null,            /* 64x32 (invalid CFL size) */ \
+      subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
+      subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
+      subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
+      subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
+      cfl_subsample_##bd##_null,            /* 16x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_null,            /* 64x16 (invalid CFL size) */ \
+    };                                                                     \
+    return subfn_##sub[tx_size];                                           \
+  }
+
 /**
  * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
  * precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -51,35 +81,36 @@
   } while ((row += CFL_BUF_LINE_I256) < row_end);
 }
 
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 32)
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 16)
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 8)
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
 
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size) {
-  static const cfl_subsample_lbd_fn subfn_420[TX_SIZES_ALL] = {
-    subsample_lbd_420_4x4_ssse3,   /* 4x4 */
-    subsample_lbd_420_8x8_ssse3,   /* 8x8 */
-    subsample_lbd_420_16x16_ssse3, /* 16x16 */
-    subsample_lbd_420_32x32_avx2,  /* 32x32 */
-    cfl_subsample_lbd_null,        /* 64x64 (invalid CFL size) */
-    subsample_lbd_420_4x8_ssse3,   /* 4x8 */
-    subsample_lbd_420_8x4_ssse3,   /* 8x4 */
-    subsample_lbd_420_8x16_ssse3,  /* 8x16 */
-    subsample_lbd_420_16x8_ssse3,  /* 16x8 */
-    subsample_lbd_420_16x32_ssse3, /* 16x32 */
-    subsample_lbd_420_32x16_avx2,  /* 32x16 */
-    cfl_subsample_lbd_null,        /* 32x64 (invalid CFL size) */
-    cfl_subsample_lbd_null,        /* 64x32 (invalid CFL size) */
-    subsample_lbd_420_4x16_ssse3,  /* 4x16  */
-    subsample_lbd_420_16x4_ssse3,  /* 16x4  */
-    subsample_lbd_420_8x32_ssse3,  /* 8x32  */
-    subsample_lbd_420_32x8_avx2,   /* 32x8  */
-    cfl_subsample_lbd_null,        /* 16x64 (invalid CFL size) */
-    cfl_subsample_lbd_null,        /* 64x16 (invalid CFL size) */
-  };
-  return subfn_420[tx_size];
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              int16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;                                // Forever 32
+  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+    _mm256_storeu_si256(row, top_16x16);
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
 }
 
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
 static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                         __m256i alpha_sign, __m256i dc_q0) {
   __m256i ac_q3 = _mm256_loadu_si256(input);
@@ -136,8 +167,8 @@
     cfl_predict_lbd_null,    /* 16x64 (invalid CFL size) */
     cfl_predict_lbd_null,    /* 64x16 (invalid CFL size) */
   };
-  /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
-              */ /* index the function pointer array out of bounds. */
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+  // function pointer array out of bounds.
   return pred[tx_size % TX_SIZES_ALL];
 }
 
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index 058d170..88968e0 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -39,6 +39,34 @@
 void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
                                    int16_t *output_q3);
 
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   int16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   int16_t *output_q3);
+
 // SSE2 version is optimal for with == 4, we reuse them in AVX2
 void subtract_average_4x4_sse2(int16_t *pred_buf_q3);
 void subtract_average_4x8_sse2(int16_t *pred_buf_q3);
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index 735b806..2e9eb35 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -78,6 +78,54 @@
   } while (pred_buf_q3 < end);
 }
 
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      int16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i fours = _mm_set1_epi8(4);
+  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const int luma_stride = input_stride;
+  __m128i top, next_top, top_16x8, next_top_16x8;
+  do {
+    switch (width) {
+      case 4: top = _mm_cvtsi32_si128(*((int *)input)); break;
+      case 8: top = _mm_loadl_epi64((__m128i *)input); break;
+      case 16: top = _mm_loadu_si128((__m128i *)input); break;
+      case 32:
+        top = _mm_loadu_si128((__m128i *)input);
+        next_top = _mm_loadu_si128((__m128i *)(input + 16));
+        break;
+      default: assert(0);
+    }
+    top_16x8 = _mm_maddubs_epi16(top, fours);
+    if (width == 32) {
+      next_top_16x8 = _mm_maddubs_epi16(next_top, fours);
+    }
+    switch (width) {
+      case 4: *((int *)pred_buf_q3) = _mm_cvtsi128_si32(top_16x8); break;
+      case 8: _mm_storel_epi64((__m128i *)pred_buf_q3, top_16x8); break;
+      case 16: _mm_storeu_si128((__m128i *)pred_buf_q3, top_16x8); break;
+      case 32:
+        _mm_storeu_si128((__m128i *)pred_buf_q3, top_16x8);
+        _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), next_top_16x8);
+        break;
+      default: assert(0);
+    }
+    input += luma_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < end);
+}
+
 CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
 
 static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index dadbfdc..41b0057 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -31,6 +31,21 @@
       make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
       make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
 
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422)                            \
+  make_tuple(TX_4X4, &fun420, &fun422), make_tuple(TX_4X8, &fun420, &fun422), \
+      make_tuple(TX_4X16, &fun420, &fun422),                                  \
+      make_tuple(TX_8X4, &fun420, &fun422),                                   \
+      make_tuple(TX_8X8, &fun420, &fun422),                                   \
+      make_tuple(TX_8X16, &fun420, &fun422),                                  \
+      make_tuple(TX_8X32, &fun420, &fun422),                                  \
+      make_tuple(TX_16X4, &fun420, &fun422),                                  \
+      make_tuple(TX_16X8, &fun420, &fun422),                                  \
+      make_tuple(TX_16X16, &fun420, &fun422),                                 \
+      make_tuple(TX_16X32, &fun420, &fun422),                                 \
+      make_tuple(TX_32X8, &fun420, &fun422),                                  \
+      make_tuple(TX_32X16, &fun420, &fun422),                                 \
+      make_tuple(TX_32X32, &fun420, &fun422)
+
 namespace {
 
 template <typename A>
@@ -187,7 +202,8 @@
 }
 
 typedef cfl_subsample_lbd_fn (*get_subsample_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_subsample_fn> subsample_param;
+typedef ::testing::tuple<TX_SIZE, get_subsample_fn, get_subsample_fn>
+    subsample_param;
 class CFLSubsampleTest : public ::testing::TestWithParam<subsample_param>,
                          public CFLTestWithData<uint8_t> {
  public:
@@ -196,12 +212,16 @@
   virtual void SetUp() {
     CFLTest::init(::testing::get<0>(this->GetParam()));
     fun_420 = ::testing::get<1>(this->GetParam())(tx_size);
+    fun_422 = ::testing::get<2>(this->GetParam())(tx_size);
     fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
+    fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
   }
 
  protected:
   cfl_subsample_lbd_fn fun_420;
+  cfl_subsample_lbd_fn fun_422;
   cfl_subsample_lbd_fn fun_420_ref;
+  cfl_subsample_lbd_fn fun_422_ref;
 
   void subsampleTest(cfl_subsample_lbd_fn fun, cfl_subsample_lbd_fn fun_ref,
                      int sub_width, int sub_height) {
@@ -251,6 +271,14 @@
   subsampleSpeedTest(fun_420, fun_420_ref);
 }
 
+TEST_P(CFLSubsampleTest, Subsample422Test) {
+  subsampleTest(fun_422, fun_422_ref, width >> 1, height);
+}
+
+TEST_P(CFLSubsampleTest, DISABLED_Subsample422SpeedTest) {
+  subsampleSpeedTest(fun_422, fun_422_ref);
+}
+
 typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
 typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
 class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
@@ -360,8 +388,9 @@
 
 #if HAVE_SSSE3
 
-const subsample_param subsample_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
-    cfl_get_luma_subsampling_420_lbd_ssse3) };
+const subsample_param subsample_sizes_ssse3[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+    cfl_get_luma_subsampling_420_lbd_ssse3,
+    cfl_get_luma_subsampling_422_lbd_ssse3) };
 
 const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
     get_predict_lbd_fn_ssse3) };
@@ -383,8 +412,9 @@
 const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
     get_subtract_average_fn_avx2) };
 
-const subsample_param subsample_sizes_avx2[] = { ALL_CFL_TX_SIZES(
-    cfl_get_luma_subsampling_420_lbd_avx2) };
+const subsample_param subsample_sizes_avx2[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+    cfl_get_luma_subsampling_420_lbd_avx2,
+    cfl_get_luma_subsampling_422_lbd_avx2) };
 
 const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
     get_predict_lbd_fn_avx2) };
@@ -409,8 +439,9 @@
 const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
     get_subtract_average_fn_neon) };
 
-const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES(
-    cfl_get_luma_subsampling_420_lbd_neon) };
+const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+    cfl_get_luma_subsampling_420_lbd_neon,
+    cfl_get_luma_subsampling_422_lbd_neon) };
 
 INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
                         ::testing::ValuesIn(sub_avg_sizes_neon));