[CFL] VSX version of Subtract Average

Includes unit tests for conformance and speed.

VSX/SubAvgSpeedTest (POWER8 Model 2.1)
8x8: C time = 1629 us, SIMD time = 693 us (~2.4x)
16x16: C time = 4231 us, SIMD time = 1883 us (~2.2x)
32x32: C time = 21329 us, SIMD time = 6916 us (~3.1x)

Change-Id: Idd6ba6dbfc9941f2d0acbb0a0010b3a45fa63e46
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 2ee0f8b..c17dd9f 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -423,6 +423,10 @@
     ${AOM_AV1_COMMON_INTRIN_NEON}
     "${AOM_ROOT}/av1/common/arm/cfl_neon.c")
 
+set(AOM_AV1_COMMON_INTRIN_VSX
+    ${AOM_AV1_COMMON_INTRIN_VSX}
+    "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
+
 set(AOM_AV1_COMMON_SOURCES
     ${AOM_AV1_COMMON_SOURCES}
     "${AOM_ROOT}/av1/common/restoration.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 42c1869..e0ca707 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -346,7 +346,7 @@
 
 # CFL
 add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
-specialize qw/get_subtract_average_fn sse2 avx2 neon/;
+specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
 
 add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 0000000..70cad05
--- /dev/null
+++ b/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "./av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector int8_t int8x16_t;
+typedef vector uint8_t uint8x16_t;
+typedef vector int16_t int16x8_t;
+typedef vector uint16_t uint16x8_t;
+typedef vector int32_t int32x4_t;
+typedef vector uint32_t uint32x4_t;
+typedef vector uint64_t uint64x2_t;
+
+static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
+                                        int height, int round_offset,
+                                        int num_pel_log2) {
+  const int16_t *end = pred_buf + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = pred_buf;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+  } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
+  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
+
+  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
+  sum_32x4 = vec_add(sum_32x4, perm_64);
+  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
+  sum_32x4 = vec_add(sum_32x4, perm_32);
+  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
+  const int16x8_t vec_avg = vec_pack(avg, avg);
+  do {
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
+               OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
+               OFF_0 + CFL_LINE_2, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
+               OFF_0 + CFL_LINE_3, pred_buf);
+    if (width >= 16) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_3, pred_buf);
+    }
+    if (width == 32) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_3, pred_buf);
+
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_3, pred_buf);
+    }
+  } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+}
+
+// Declare wrappers for VSX sizes
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
+CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
+CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
+CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
+CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
+CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
+CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
+
+// Based on observation, for small blocks VSX does not outperform C (no 64bit
+// load and store intrinsics). So we call the C code for block widths 4.
+cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+    subtract_average_4x4_c,     /* 4x4 */
+    subtract_average_8x8_vsx,   /* 8x8 */
+    subtract_average_16x16_vsx, /* 16x16 */
+    subtract_average_32x32_vsx, /* 32x32 */
+    cfl_subtract_average_null,  /* 64x64 (invalid CFL size) */
+    subtract_average_4x8_c,     /* 4x8 */
+    subtract_average_8x4_vsx,   /* 8x4 */
+    subtract_average_8x16_vsx,  /* 8x16 */
+    subtract_average_16x8_vsx,  /* 16x8 */
+    subtract_average_16x32_vsx, /* 16x32 */
+    subtract_average_32x16_vsx, /* 32x16 */
+    cfl_subtract_average_null,  /* 32x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x32 (invalid CFL size) */
+    subtract_average_4x16_c,    /* 4x16 */
+    subtract_average_16x4_vsx,  /* 16x4 */
+    subtract_average_8x32_vsx,  /* 8x32 */
+    subtract_average_32x8_vsx,  /* 32x8 */
+    cfl_subtract_average_null,  /* 16x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+  // index the function pointer array out of bounds.
+  return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 41b0057..4890641 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -450,4 +450,12 @@
                         ::testing::ValuesIn(subsample_sizes_neon));
 
 #endif
+
+#if HAVE_VSX
+const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES(
+    get_subtract_average_fn_vsx) };
+
+INSTANTIATE_TEST_CASE_P(VSX, CFLSubAvgTest,
+                        ::testing::ValuesIn(sub_avg_sizes_vsx));
+#endif
 }  // namespace