Add subtract_block SSE2 version and unit test.

3% faster overall (3min35.0 to 3min28.5).

Change-Id: I5ff8a5c2c91586b6632ca5009ad1ea51ce94af5e
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
new file mode 100644
index 0000000..1a62b57
--- /dev/null
+++ b/test/vp9_subtract_test.cc
@@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+}
+
+typedef void (*subtract_fn_t)(int rows, int cols,
+                              int16_t *diff_ptr, ptrdiff_t diff_stride,
+                              const uint8_t *src_ptr, ptrdiff_t src_stride,
+                              const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+
+namespace vp9 {
+
+class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  // FIXME(rbultje) split in its own file
+  for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
+       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+    const int block_width  = 4 << b_width_log2(bsize);
+    const int block_height = 4 << b_height_log2(bsize);
+    int16_t *diff = new int16_t[block_width * block_height * 2];
+    uint8_t *pred = new uint8_t[block_width * block_height * 2];
+    uint8_t *src  = new uint8_t[block_width * block_height * 2];
+
+    for (int n = 0; n < 100; n++) {
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width * 2; ++c) {
+          src[r * block_width * 2 + c] = rnd.Rand8();
+          pred[r * block_width * 2 + c] = rnd.Rand8();
+        }
+      }
+
+      GetParam()(block_height, block_width, diff, block_width,
+                 src, block_width, pred, block_width);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width + c],
+                    (src[r * block_width + c] -
+                     pred[r * block_width + c])) << "r = " << r
+                                                 << ", c = " << c
+                                                 << ", bs = " << bsize;
+        }
+      }
+
+      GetParam()(block_height, block_width, diff, block_width * 2,
+                 src, block_width * 2, pred, block_width * 2);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width * 2 + c],
+                    (src[r * block_width * 2 + c] -
+                     pred[r * block_width * 2 + c])) << "r = " << r
+                                                     << ", c = " << c
+                                                     << ", bs = " << bsize;
+        }
+      }
+    }
+    delete[] diff;
+    delete[] pred;
+    delete[] src;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
+                        ::testing::Values(vp9_subtract_block_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
+                        ::testing::Values(vp9_subtract_block_sse2));
+#endif
+
+}  // namespace vp9