Improve vpx_blend_* functions.

- Made source buffers pointers to const.
- Renamed vpx_blend_mask6b to vpx_blend_a64_mask. This is more
  indicative that the function does alpha blending. The 6, or 6b
  suffix was misleading, as the max mask value (64) does not fit into
  6 bits.
- Added VPX_BLEND_* macros to use when needing to blend scalars.
- Use VPX_BLEND_A256 in combine_interintra to be more explicit about
  the operation being done.
- Added versions of vpx_blend_a64_* which take 1D horizontal/vertical
  masks directly and apply them to all rows/columns
  (vpx_blend_a64_hmask and vpx_blend_a64_vmask). The SSE4.1 optimzied
  horizontal version now falls back on the 2D version. This can be
  improved upon if it show up high enough in a profile.
- All vpx_blend_a64_* functions now support block sizes down to 1x1
  (ie: a single pixel). This is for usage convenience. The SSE4.1
  optimized versions fall back on the C implementation if
  w <= 2 or h <= 2. This can again be improved if it becomes hot code.

Change-Id: I13ab3835146ffafe3e1d74d8e9cf64a5abe4144d
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
new file mode 100644
index 0000000..08ee91d
--- /dev/null
+++ b/test/blend_a64_mask_test.cc
@@ -0,0 +1,297 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
+
+  BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+  virtual ~BlendA64MaskTest() {}
+
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
+  void Common() {
+    w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+
+    subx_ = rng_(2);
+    suby_ = rng_(2);
+
+    dst_offset_ = rng_(33);
+    dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = rng_(33);
+    src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = rng_(33);
+    src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    mask_stride_ = rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1))
+                  + w_ * (subx_ ? 2 : 1);
+
+    T *p_src0;
+    T *p_src1;
+
+    switch (rng_(3)) {
+      case 0:   // Separate sources
+        p_src0 = src0_;
+        p_src1 = src1_;
+        break;
+      case 1:   // src0 == dst
+        p_src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        p_src1 = src1_;
+        break;
+      case 2:   // src1 == dst
+        p_src0 = src0_;
+        p_src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default:
+        FAIL();
+    }
+
+    Execute(p_src0, p_src1);
+
+    for (int r = 0 ; r < h_ ; ++r) {
+      for (int c = 0 ; c < w_ ; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  ACMRandom rng_;
+
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  size_t dst_stride_;
+  size_t dst_offset_;
+
+  T src0_[kBufSize];
+  size_t src0_stride_;
+  size_t src0_offset_;
+
+  T src1_[kBufSize];
+  size_t src1_stride_;
+  size_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+  size_t mask_stride_;
+
+  int w_;
+  int h_;
+
+  bool suby_;
+  bool subx_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
+
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+    ref_func_(dst_ref_ + dst_offset_, dst_stride_,
+              p_src0 + src0_offset_, src0_stride_,
+              p_src1 + src1_offset_, src1_stride_,
+              mask_, kMaxMaskWidth,
+              h_, w_, suby_, subx_);
+
+    tst_func_(dst_tst_ + dst_offset_, dst_stride_,
+              p_src0 + src0_offset_, src0_stride_,
+              p_src1 + src1_offset_, src1_stride_,
+              mask_, kMaxMaskWidth,
+              h_, w_, suby_, subx_);
+  }
+};
+
+TEST_P(BlendA64MaskTest8B, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(2) + 254;
+      dst_tst_[i] = rng_(2) + 254;
+      src0_[i] = rng_(2) + 254;
+      src1_[i] = rng_(2) + 254;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+  ::testing::Values(make_tuple(vpx_blend_a64_mask_c,
+                               vpx_blend_a64_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
+
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+              mask_, kMaxMaskWidth,
+              h_, w_, suby_, subx_, bit_depth_);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                mask_, kMaxMaskWidth,
+                h_, w_, suby_, subx_, bit_depth_));
+  }
+
+  int bit_depth_;
+};
+
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+
+    const int hi = 1 << bit_depth_;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi);
+      dst_tst_[i] = rng_(hi);
+      src0_[i] = rng_(hi);
+      src1_[i] = rng_(hi);
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
+  for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+  ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c,
+                               vpx_highbd_blend_a64_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace