Add AVX2 variant for temporal_filter_plane_c. Encode-time reduction by cpu-used: 4: 1.17%, 3: 0.95%, 2: 0.47%, 1: 0.24%. Module gains improved by a factor of ~5x w.r.t. C. Change-Id: If8dd1c60cd9fc0b65fb17d05f5d207eaeba30e9a

commit: 2474d1576eca2867313e0572c5e41ecfef9fcb2b [log] [tgz]
author: Jayasanker J <jayasanker.j@ittiam.com> Wed Dec 04 14:44:46 2019 +0530
committer: Yunqing Wang <yunqingwang@google.com> Thu Dec 05 17:38:54 2019 +0000
tree: 8109c008966d66e5f879d49015b30df10083b495
parent: d71ace6342928a53e820248e082a0b55336c8d0d [diff]
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 16d1e34..66ee33b 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake

@@ -345,6 +345,7 @@
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
 if(NOT CONFIG_AV1_HIGHBITDEPTH)

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3dd0423..a00975f 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -281,6 +281,10 @@
     specialize qw/av1_apply_temporal_filter sse4_1/;
   }
 
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_temporal_filter_plane/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int stride2, int block_width, int block_height, int strength, double sigma, int decay_control, const int *blk_fw, int use_32x32, unsigned int *accumulator, uint16_t *count";
+    specialize qw/av1_temporal_filter_plane avx2/;
+  }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
   # ENCODEMB INVOKE

diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f8a318c1..493c621 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c

@@ -36,11 +36,6 @@
 #include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 
-#define EXPERIMENT_TEMPORAL_FILTER 1
-#define WINDOW_LENGTH 2
-#define WINDOW_SIZE 25
-#define SCALE 1000
-
 static unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
                                        39322, 32768, 28087, 24576, 21846,
                                        19661, 17874, 0,     15124 };
@@ -849,17 +844,17 @@
     } else {
       decay_control = 3;
     }
-    av1_temporal_filter_plane_c(frame->y_buffer + mb_y_src_offset,
-                                frame->y_stride, predictor, BW, BW, BH,
-                                strength, sigma, decay_control, blk_fw,
-                                use_32x32, accumulator, count);
+    av1_temporal_filter_plane(frame->y_buffer + mb_y_src_offset,
+                              frame->y_stride, predictor, BW, BW, BH, strength,
+                              sigma, decay_control, blk_fw, use_32x32,
+                              accumulator, count);
     if (num_planes > 1) {
-      av1_temporal_filter_plane_c(
+      av1_temporal_filter_plane(
           frame->u_buffer + mb_uv_src_offset, frame->uv_stride,
           predictor + BLK_PELS, mb_uv_width, mb_uv_width, mb_uv_height,
           strength, sigma, decay_control, blk_fw, use_32x32,
           accumulator + BLK_PELS, count + BLK_PELS);
-      av1_temporal_filter_plane_c(
+      av1_temporal_filter_plane(
           frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
           predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_width, mb_uv_height,
           strength, sigma, decay_control, blk_fw, use_32x32,

diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 2df2666..a8982f5 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h

@@ -34,6 +34,11 @@
 #define EDGE_THRESHOLD 50
 #define SQRT_PI_BY_2 1.25331413732
 
+#define EXPERIMENT_TEMPORAL_FILTER 1
+#define WINDOW_LENGTH 2
+#define WINDOW_SIZE 25
+#define SCALE 1000
+
 static INLINE BLOCK_SIZE dims_to_size(int w, int h) {
   if (w != h) return -1;
   switch (w) {

diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000..3838039
--- /dev/null
+++ b/av1/encoder/x86/temporal_filter_avx2.c

@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 2)
+
+#if EXPERIMENT_TEMPORAL_FILTER
+// Lane-selection masks used by xx_mask_and_hadd(). Row i keeps the five
+// consecutive 32-bit lanes i..i+4 and clears the other three. A kept lane is
+// all-ones so the whole 32-bit column sum survives the AND: five squared
+// 8-bit differences can add up to 5 * 255 * 255 = 325125, which does not fit
+// in the low 16 bits.
+DECLARE_ALIGNED(32, const uint32_t, sse_bytemask[4][8]) = {
+  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+// Byte shuffles over a vector of eight 16-bit elements e0..e7.
+// Row 0 produces e0,e0,e0,e1,e2,e3,e4,e5 (left edge replication).
+// Row 1 produces e0,e1,e2,e3,e4,e5,e5,e5 (right edge replication).
+DECLARE_ALIGNED(32, const uint8_t, shufflemask_16b[2][16]) = {
+  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
+};
+
+// Writes (frame1 - frame2)^2 for a 16-pixel-wide block into frame_sse, one
+// row of 16 uint16_t values per input row, rows sse_stride elements apart.
+// block_width is unused; 16 pixels are always processed per row.
+AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int stride2,
+    int block_width, int block_height, uint16_t *frame_sse,
+    unsigned int sse_stride) {
+  (void)block_width;
+  uint8_t *src1 = frame1;
+  uint8_t *src2 = frame2;
+  uint16_t *dst = frame_sse;
+  for (int i = 0; i < block_height; i++) {
+    __m128i vf1_128, vf2_128;
+    __m256i vf1, vf2, vdiff1, vsqdiff1;
+
+    vf1_128 = _mm_loadu_si128((__m128i *)(src1));
+    vf2_128 = _mm_loadu_si128((__m128i *)(src2));
+    vf1 = _mm256_cvtepu8_epi16(vf1_128);
+    vf2 = _mm256_cvtepu8_epi16(vf2_128);
+    vdiff1 = _mm256_sub_epi16(vf1, vf2);
+    // The low 16 bits of the product hold the full square, since
+    // 255 * 255 = 65025 fits in an unsigned 16-bit value.
+    vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+
+    _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
+    // xx_load_and_pad() reads two elements past the end of the row for the
+    // last column group. Zero them so that load never touches uninitialized
+    // memory. Plain uint16_t stores avoid type-punning the buffer.
+    dst[16] = 0;
+    dst[17] = 0;
+
+    src1 += stride;
+    src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+// Writes (frame1 - frame2)^2 for a 32-pixel-wide block into frame_sse, one
+// row of 32 uint16_t values per input row, rows sse_stride elements apart.
+// block_width is unused; 32 pixels are always processed per row.
+AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+    uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int stride2,
+    int block_width, int block_height, uint16_t *frame_sse,
+    unsigned int sse_stride) {
+  (void)block_width;
+  uint8_t *src1 = frame1;
+  uint8_t *src2 = frame2;
+  uint16_t *dst = frame_sse;
+  for (int i = 0; i < block_height; i++) {
+    __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+    vsrc1 = _mm256_loadu_si256((__m256i *)src1);
+    vsrc2 = _mm256_loadu_si256((__m256i *)src2);
+    // |a - b| on unsigned bytes, computed as max - min.
+    vmax = _mm256_max_epu8(vsrc1, vsrc2);
+    vmin = _mm256_min_epu8(vsrc1, vsrc2);
+    vdiff = _mm256_subs_epu8(vmax, vmin);
+
+    // Widen each 16-byte half to sixteen 16-bit values before squaring.
+    __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+    __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+    vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+    vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+    vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+    vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+    _mm256_storeu_si256((__m256i *)(dst), vres1);
+    _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+    // xx_load_and_pad() reads two elements past the end of the row for the
+    // last column group. Zero them so that load never touches uninitialized
+    // memory. Plain uint16_t stores avoid type-punning the buffer.
+    dst[32] = 0;
+    dst[33] = 0;
+
+    src1 += stride;
+    src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+// Loads eight consecutive 16-bit squared errors from src and widens them to
+// eight 32-bit lanes. For the first column group (col == 0) the first
+// element is replicated twice to the left; for the last column group
+// (col == block_width - 4) the sixth element is replicated twice to the
+// right.
+AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
+                                         int block_width) {
+  __m128i v128tmp = _mm_loadu_si128((__m128i *)(src));
+  if (col == 0) {
+    // For the first column, replicate the first element twice to the left
+    v128tmp = _mm_shuffle_epi8(
+        v128tmp, _mm_load_si128((const __m128i *)shufflemask_16b[0]));
+  }
+  if (col == block_width - 4) {
+    // For the last column, replicate the last element twice to the right
+    v128tmp = _mm_shuffle_epi8(
+        v128tmp, _mm_load_si128((const __m128i *)shufflemask_16b[1]));
+  }
+  // The stored values are unsigned squares of up to 65025, so they must be
+  // zero-extended. Sign extension would turn every value >= 32768 negative.
+  return _mm256_cvtepu16_epi32(v128tmp);
+}
+
+// Returns the sum of the five 32-bit lanes i..i+4 of vsum (i in 0..3).
+AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  // Mask the required 5 values inside the vector
+  __m256i vtmp = _mm256_and_si256(
+      vsum, _mm256_load_si256((const __m256i *)sse_bytemask[i]));
+  __m128i v128a, v128b;
+  // Extract 256b as two 128b registers A and B
+  v128a = _mm256_castsi256_si128(vtmp);
+  v128b = _mm256_extracti128_si256(vtmp, 1);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A2+B2, A3+B3, 0, 0]
+  v128b = _mm_srli_si128(v128a, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  v128b = _mm_srli_si128(v128a, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  return _mm_extract_epi32(v128a, 0);
+}
+
+// AVX2 implementation of av1_temporal_filter_plane().
+//
+// For every pixel of a 16x16 or 32x32 block this:
+//   1. computes the squared difference between frame1 and frame2,
+//   2. sums those squared differences over a 5x5 window centred on the
+//      pixel, replicating the edge rows/columns where the window leaves the
+//      block,
+//   3. divides the sum by WINDOW_SIZE and maps it to an integer weight
+//      (int)(exp(max(-avg / (2 * beta * h * h), -15)) * SCALE), with
+//      h = decay_control * exp(1 - sigma) * sigma and beta = 1,
+//   4. adds the weight to count[k] and weight * frame2 pixel to
+//      accumulator[k], where k is the raster index inside the block.
+//
+// strength, blk_fw and use_32x32 are accepted for signature compatibility
+// and are not read.
+void av1_temporal_filter_plane_avx2(uint8_t *frame1, unsigned int stride,
+                                    uint8_t *frame2, unsigned int stride2,
+                                    int block_width, int block_height,
+                                    int strength, double sigma,
+                                    int decay_control, const int *blk_fw,
+                                    int use_32x32, unsigned int *accumulator,
+                                    uint16_t *count) {
+  (void)strength;
+  (void)blk_fw;
+  (void)use_32x32;
+  const double decay = decay_control * exp(1 - sigma);
+  const double h = decay * sigma;
+  const double beta = 1.0;
+
+  // Per-pixel squared errors, SSE_STRIDE elements per row. The two extra
+  // elements per row are the zeroed padding read by xx_load_and_pad().
+  uint16_t frame_sse[SSE_STRIDE * BH];
+  // 5x5 window sums of frame_sse, one per pixel.
+  uint32_t acc_5x5_sse[BH][BW];
+
+  assert(((block_width == 32) && (block_height == 32)) ||
+         ((block_width == 16) && (block_height == 16)));
+
+  if (block_width == 32) {
+    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  } else {
+    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  }
+
+  // Sliding window of five consecutive rows, each widened to 32-bit lanes.
+  __m256i vsrc[5];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    // Start two columns to the left so the 8 loaded values cover the 5-wide
+    // windows of all 4 output columns; the first group pads instead.
+    uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = xx_load_and_pad(src, col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Copy first row to first 2 vectors
+    vsrc[0] = vsrc[2];
+    vsrc[1] = vsrc[2];
+
+    for (int row = 0; row < block_height; row++) {
+      __m256i vsum = _mm256_setzero_si256();
+
+      // Add 5 consecutive rows
+      for (int i = 0; i < 5; i++) {
+        vsum = _mm256_add_epi32(vsum, vsrc[i]);
+      }
+
+      // Push all elements by one element to the top
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      // Load next row to the last element. The next row needed is row + 3,
+      // which exists only while row + 3 <= block_height - 1; past that the
+      // bottom row is replicated. This is a vertical bound, so it is checked
+      // against block_height.
+      if (row <= block_height - 4) {
+        vsrc[4] = xx_load_and_pad(src, col, block_width);
+        src += SSE_STRIDE;
+      } else {
+        vsrc[4] = vsrc[3];
+      }
+
+      // Accumulate the sum horizontally
+      for (int i = 0; i < 4; i++) {
+        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
+      }
+    }
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+
+      int diff_sse = acc_5x5_sse[i][j];
+      diff_sse /= WINDOW_SIZE;
+
+      double scaled_diff = -diff_sse / (2 * beta * h * h);
+      // clamp the value to avoid underflow in exp()
+      if (scaled_diff < -15) scaled_diff = -15;
+      double w = exp(scaled_diff);
+      const int weight = (int)(w * SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+#endif

diff --git a/test/temporal_filter_plane_test.cc b/test/temporal_filter_plane_test.cc
new file mode 100644
index 0000000..1d7cd1e
--- /dev/null
+++ b/test/temporal_filter_plane_test.cc

@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+// Signature shared by the C reference and the optimized implementations of
+// av1_temporal_filter_plane().
+typedef void (*temporal_filter_plane_func)(
+    uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int stride2,
+    int block_width, int block_height, int strength, double sigma,
+    int decay_control, const int *blk_fw, int use_32x32,
+    unsigned int *accumulator, uint16_t *count);
+// Pairs a reference function (ref_func) with the function under test
+// (tst_func).
+typedef libaom_test::FuncParam<temporal_filter_plane_func>
+    TestTemporal_FilterPlane;
+
+// Test parameter: the function pair plus an int. The fixture below reads only
+// element 0 of the tuple.
+typedef ::testing::tuple<TestTemporal_FilterPlane, int> TemporalFilter_Params;
+
+// Fixture that owns two 256x256 byte planes and compares a reference
+// temporal-filter function against an optimized one.
+class TemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilter_Params> {
+ public:
+  virtual ~TemporalFilterTest() {}
+
+  // Picks up the function pair, reseeds the RNG and allocates both planes.
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    const size_t kPlaneBytes = 256 * 256;
+    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, kPlaneBytes));
+    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, kPlaneBytes));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  // Releases both planes after clearing the system state.
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+
+  void RunTest(int isRandom, int width, int height, int run_times);
+
+  // Fills the width x height region of both planes with random bytes. The
+  // two planes are filled pixel by pixel in lockstep, src1_ first.
+  void GenRandomData(int width, int height, int stride, int stride2) {
+    for (int row = 0; row < height; ++row) {
+      uint8_t *const line1 = src1_ + row * stride;
+      uint8_t *const line2 = src2_ + row * stride2;
+      for (int col = 0; col < width; ++col) {
+        line1[col] = rnd_.Rand8();
+        line2[col] = rnd_.Rand8();
+      }
+    }
+  }
+
+  // Fills the width x height region of data with val and of data2 with
+  // 255 - val.
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int stride2, uint8_t *data2, uint8_t val) {
+    const uint8_t complement = (255 - val);
+    for (int row = 0; row < height; ++row) {
+      uint8_t *const line = data + row * stride;
+      uint8_t *const line2 = data2 + row * stride2;
+      for (int col = 0; col < width; ++col) {
+        line[col] = val;
+        line2[col] = complement;
+      }
+    }
+  }
+
+ protected:
+  TestTemporal_FilterPlane params_;  // reference / test function pair
+  uint8_t *src1_;                    // first input plane
+  uint8_t *src2_;                    // second input plane
+  ACMRandom rnd_;                    // deterministic random source
+};
+
+// Runs the reference and the test function on the same input three times,
+// each time with freshly chosen strides.
+//   isRandom  - non-zero: random input; zero: constant extreme input
+//               (255 vs 0 on the first pass, 0 vs 255 afterwards).
+//   width, height - block dimensions passed to both functions.
+//   run_times - 1: compare accumulator and count element by element;
+//               > 1: time run_times calls of each function and print the
+//               ratio instead of comparing.
+void TemporalFilterTest::RunTest(int isRandom, int width, int height,
+                                 int run_times) {
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    // Strides are 5 << s; at most 5 << 5 = 160 if rnd_(6) yields 0..5
+    // (assumed from its use here - confirm against ACMRandom).
+    int stride = 5 << rnd_(6);
+    int stride2 = 5 << rnd_(6);
+
+    // Both planes need a stride of at least one row of pixels; otherwise
+    // consecutive rows would overlap in memory.
+    while (stride < width || stride2 < width) {
+      stride = 5 << rnd_(6);
+      stride2 = 5 << rnd_(6);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2);
+    } else {
+      const int msb = 8;  // 8-bit samples
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
+      }
+    }
+    int use32X32 = 1;
+    int strength = rnd_(16);
+    double sigma = 2.1002103677063437;
+    int decay_control = 5;
+    int blk_fw = rnd_(16);
+    // Separate zeroed outputs for the reference and the test function.
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    params_.ref_func(src1_, stride, src2_, stride2, width, height, strength,
+                     sigma, decay_control, &blk_fw, use32X32, accumulator_ref,
+                     count_ref);
+    params_.tst_func(src1_, stride, src2_, stride2, width, height, strength,
+                     sigma, decay_control, &blk_fw, use32X32, accumulator_mod,
+                     count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(src1_, stride, src2_, stride2, width, height, strength,
+                         sigma, decay_control, &blk_fw, use32X32,
+                         accumulator_ref, count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(src1_, stride, src2_, stride2, width, height, strength,
+                         sigma, decay_control, &blk_fw, use32X32,
+                         accumulator_mod, count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d \n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height);
+
+    } else {
+      // Outputs are written in raster order without a stride, so a single
+      // running index l covers the block.
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] C count does not match optimized count.";
+        }
+      }
+    }
+  }
+}
+
+// Random input, single comparison run, on 16x16 and then 32x32 blocks.
+TEST_P(TemporalFilterTest, OperationCheck) {
+  const int kUseRandomData = 1;
+  const int kRunOnce = 1;
+  for (int size = 16; size <= 32; size <<= 1) {
+    RunTest(kUseRandomData, size, size, kRunOnce);
+  }
+}
+
+// Constant extreme input, single comparison run, on 16x16 and then 32x32
+// blocks.
+TEST_P(TemporalFilterTest, ExtremeValues) {
+  const int kUseExtremeData = 0;
+  const int kRunOnce = 1;
+  for (int size = 16; size <= 32; size <<= 1) {
+    RunTest(kUseExtremeData, size, size, kRunOnce);
+  }
+}
+
+// Timing run on random input: 100000 calls per function for 16x16 and then
+// 32x32 blocks.
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  const int kUseRandomData = 1;
+  const int kTimedRuns = 100000;
+  for (int size = 16; size <= 32; size <<= 1) {
+    RunTest(kUseRandomData, size, size, kTimedRuns);
+  }
+}
+
+// Instantiates the fixture with the C reference paired against the AVX2
+// implementation. Range(64, 65, 4) yields the single value 64 for the second
+// tuple element, which the fixture in this file never reads.
+#if HAVE_AVX2
+TestTemporal_FilterPlane Temporal_filter_test[] = { TestTemporal_FilterPlane(
+    &av1_temporal_filter_plane_c, &av1_temporal_filter_plane_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, TemporalFilterTest,
+                        Combine(ValuesIn(Temporal_filter_test),
+                                Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+}  // namespace
+#endif

diff --git a/test/test.cmake b/test/test.cmake
index 9b2ad47..eb3f528 100644
--- a/test/test.cmake
+++ b/test/test.cmake

@@ -130,6 +130,7 @@
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
                 "${AOM_ROOT}/test/superframe_test.cc"
                 "${AOM_ROOT}/test/tile_independence_test.cc"
+                "${AOM_ROOT}/test/temporal_filter_plane_test.cc"
                 "${AOM_ROOT}/test/yuv_temporal_filter_test.cc")
     if(CONFIG_REALTIME_ONLY)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
commit	2474d1576eca2867313e0572c5e41ecfef9fcb2b	[log] [tgz]
author	Jayasanker J <jayasanker.j@ittiam.com>	Wed Dec 04 14:44:46 2019 +0530
committer	Yunqing Wang <yunqingwang@google.com>	Thu Dec 05 17:38:54 2019 +0000
tree	8109c008966d66e5f879d49015b30df10083b495
parent	d71ace6342928a53e820248e082a0b55336c8d0d [diff]