Merge "build: modify default ARFLAGS / .a target"
diff --git a/configure b/configure
index 30a5c11..0e01a2e 100755
--- a/configure
+++ b/configure
@@ -264,6 +264,7 @@
spatial_svc
fp_mb_stats
emulate_hardware
+ misc_fixes
"
CONFIG_LIST="
dependency_tracking
diff --git a/examples.mk b/examples.mk
index dfa5a65..f10bec6 100644
--- a/examples.mk
+++ b/examples.mk
@@ -36,6 +36,8 @@
third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_win.cc \
+LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
+
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
third_party/libwebm/mkvwriter.cpp \
@@ -43,8 +45,7 @@
third_party/libwebm/mkvmuxertypes.hpp \
third_party/libwebm/mkvmuxerutil.hpp \
third_party/libwebm/mkvparser.hpp \
- third_party/libwebm/mkvwriter.hpp \
- third_party/libwebm/webmids.hpp
+ third_party/libwebm/mkvwriter.hpp
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
third_party/libwebm/mkvreader.cpp \
@@ -68,6 +69,7 @@
vpxdec.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
+ vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
endif
@@ -89,6 +91,7 @@
vpxenc.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
+ vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
endif
diff --git a/test/test.mk b/test/test.mk
index 6bb08be..3361c91 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -167,6 +167,7 @@
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
endif # CONFIG_SHARED
include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/test/vp10_inv_txfm_test.cc b/test/vp10_inv_txfm_test.cc
new file mode 100644
index 0000000..c49081e
--- /dev/null
+++ b/test/vp10_inv_txfm_test.cc
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/scan.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const double PI = 3.141592653589793238462643383279502884;
+const double kInvSqrt2 = 0.707106781186547524400844362104;
+
+void reference_idct_1d(const double *in, double *out, int size) {
+ for (int n = 0; n < size; ++n) {
+ out[n] = 0;
+ for (int k = 0; k < size; ++k) {
+ if (k == 0)
+ out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+ else
+ out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+ }
+ }
+}
+
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+ virtual ~TransTestBase() {}
+
+ protected:
+ void RunInvAccuracyCheck() {
+ tran_low_t *input = new tran_low_t[txfm_size_];
+ tran_low_t *output = new tran_low_t[txfm_size_];
+ double *ref_input = new double[txfm_size_];
+ double *ref_output = new double[txfm_size_];
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ for (int ti = 0; ti < count_test_block; ++ti) {
+ for (int ni = 0; ni < txfm_size_; ++ni) {
+ input[ni] = rnd.Rand8() - rnd.Rand8();
+ ref_input[ni] = static_cast<double>(input[ni]);
+ }
+
+ fwd_txfm_(input, output);
+ fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
+
+ for (int ni = 0; ni < txfm_size_; ++ni) {
+ EXPECT_LE(
+ abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+ max_error_);
+ }
+ }
+
+ delete[] input;
+ delete[] output;
+ delete[] ref_input;
+ delete[] ref_output;
+ }
+
+ double max_error_;
+ int txfm_size_;
+ IdctFunc fwd_txfm_;
+ IdctFuncRef fwd_txfm_ref_;
+};
+
+typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
+class Vp10InvTxfm
+ : public TransTestBase,
+ public ::testing::TestWithParam<IdctParam> {
+ public:
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = GET_PARAM(1);
+ txfm_size_ = GET_PARAM(2);
+ max_error_ = GET_PARAM(3);
+ }
+ virtual void TearDown() {}
+};
+
+TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, Vp10InvTxfm,
+ ::testing::Values(
+ IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
+ IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
+ IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
+ IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
+);
+
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef std::tr1::tuple<FwdTxfmFunc,
+ InvTxfmFunc,
+ InvTxfmFunc,
+ TX_SIZE, int> PartialInvTxfmParam;
+const int kMaxNumCoeffs = 1024;
+class Vp10PartialIDctTest
+ : public ::testing::TestWithParam<PartialInvTxfmParam> {
+ public:
+ virtual ~Vp10PartialIDctTest() {}
+ virtual void SetUp() {
+ ftxfm_ = GET_PARAM(0);
+ full_itxfm_ = GET_PARAM(1);
+ partial_itxfm_ = GET_PARAM(2);
+ tx_size_ = GET_PARAM(3);
+ last_nonzero_ = GET_PARAM(4);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int last_nonzero_;
+ TX_SIZE tx_size_;
+ FwdTxfmFunc ftxfm_;
+ InvTxfmFunc full_itxfm_;
+ InvTxfmFunc partial_itxfm_;
+};
+
+TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int size;
+ switch (tx_size_) {
+ case TX_4X4:
+ size = 4;
+ break;
+ case TX_8X8:
+ size = 8;
+ break;
+ case TX_16X16:
+ size = 16;
+ break;
+ case TX_32X32:
+ size = 32;
+ break;
+ default:
+ FAIL() << "Wrong Size!";
+ break;
+ }
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+
+ const int count_test_block = 1000;
+ const int block_size = size * size;
+
+ DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
+
+ int max_error = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // clear out destination buffer
+ memset(dst1, 0, sizeof(*dst1) * block_size);
+ memset(dst2, 0, sizeof(*dst2) * block_size);
+ memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+ memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ if (i == 0) {
+ for (int j = 0; j < block_size; ++j)
+ input_extreme_block[j] = 255;
+ } else if (i == 1) {
+ for (int j = 0; j < block_size; ++j)
+ input_extreme_block[j] = -255;
+ } else {
+ for (int j = 0; j < block_size; ++j) {
+ input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ }
+ }
+
+ ftxfm_(input_extreme_block, output_ref_block, size);
+
+ // quantization with maximum allowed step sizes
+ test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+ for (int j = 1; j < last_nonzero_; ++j)
+ test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+ = (output_ref_block[j] / 1828) * 1828;
+ }
+
+ ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+ ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+
+ for (int j = 0; j < block_size; ++j) {
+ const int diff = dst1[j] - dst2[j];
+ const int error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ }
+ }
+
+ EXPECT_EQ(0, max_error)
+ << "Error: partial inverse transform produces different results";
+}
+
+TEST_P(Vp10PartialIDctTest, ResultsMatch) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int size;
+ switch (tx_size_) {
+ case TX_4X4:
+ size = 4;
+ break;
+ case TX_8X8:
+ size = 8;
+ break;
+ case TX_16X16:
+ size = 16;
+ break;
+ case TX_32X32:
+ size = 32;
+ break;
+ default:
+ FAIL() << "Wrong Size!";
+ break;
+ }
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+ const int count_test_block = 1000;
+ const int max_coeff = 32766 / 4;
+ const int block_size = size * size;
+ int max_error = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // clear out destination buffer
+ memset(dst1, 0, sizeof(*dst1) * block_size);
+ memset(dst2, 0, sizeof(*dst2) * block_size);
+ memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+ memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+ int max_energy_leftover = max_coeff * max_coeff;
+ for (int j = 0; j < last_nonzero_; ++j) {
+ int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+ (rnd.Rand16() - 32768) / 65536);
+ max_energy_leftover -= coef * coef;
+ if (max_energy_leftover < 0) {
+ max_energy_leftover = 0;
+ coef = 0;
+ }
+ test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
+ }
+
+ memcpy(test_coef_block2, test_coef_block1,
+ sizeof(*test_coef_block2) * block_size);
+
+ ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+ ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+
+ for (int j = 0; j < block_size; ++j) {
+ const int diff = dst1[j] - dst2[j];
+ const int error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ }
+ }
+
+ EXPECT_EQ(0, max_error)
+ << "Error: partial inverse transform produces different results";
+}
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, Vp10PartialIDctTest,
+ ::testing::Values(
+ make_tuple(&vpx_fdct32x32_c,
+ &vp10_idct32x32_1024_add_c,
+ &vp10_idct32x32_34_add_c,
+ TX_32X32, 34),
+ make_tuple(&vpx_fdct32x32_c,
+ &vp10_idct32x32_1024_add_c,
+ &vp10_idct32x32_1_add_c,
+ TX_32X32, 1),
+ make_tuple(&vpx_fdct16x16_c,
+ &vp10_idct16x16_256_add_c,
+ &vp10_idct16x16_10_add_c,
+ TX_16X16, 10),
+ make_tuple(&vpx_fdct16x16_c,
+ &vp10_idct16x16_256_add_c,
+ &vp10_idct16x16_1_add_c,
+ TX_16X16, 1),
+ make_tuple(&vpx_fdct8x8_c,
+ &vp10_idct8x8_64_add_c,
+ &vp10_idct8x8_12_add_c,
+ TX_8X8, 12),
+ make_tuple(&vpx_fdct8x8_c,
+ &vp10_idct8x8_64_add_c,
+ &vp10_idct8x8_1_add_c,
+ TX_8X8, 1),
+ make_tuple(&vpx_fdct4x4_c,
+ &vp10_idct4x4_16_add_c,
+ &vp10_idct4x4_1_add_c,
+ TX_4X4, 1)));
+} // namespace
diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 91875e1..f07a52b 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,7 +1,10 @@
URL: https://chromium.googlesource.com/webm/libwebm
-Version: 2dec09426ab62b794464cc9971bd135b4d313e65
+Version: a58c32339e06e5d672a58cdd5844cea0a661e735
License: BSD
License File: LICENSE.txt
Description:
libwebm is used to handle WebM container I/O.
+
+Local Changes:
+* <none>
diff --git a/third_party/libwebm/mkvmuxer.hpp b/third_party/libwebm/mkvmuxer.hpp
index 497ad4c..03a002c 100644
--- a/third_party/libwebm/mkvmuxer.hpp
+++ b/third_party/libwebm/mkvmuxer.hpp
@@ -528,7 +528,7 @@
public:
// Audio and video type defined by the Matroska specs.
enum { kVideo = 0x1, kAudio = 0x2 };
- // Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
+
static const char kOpusCodecId[];
static const char kVorbisCodecId[];
static const char kVp8CodecId[];
diff --git a/third_party/libwebm/mkvparser.cpp b/third_party/libwebm/mkvparser.cpp
index fc01be5..4306a51 100644
--- a/third_party/libwebm/mkvparser.cpp
+++ b/third_party/libwebm/mkvparser.cpp
@@ -7,45 +7,51 @@
// be found in the AUTHORS file in the root of the source tree.
#include "mkvparser.hpp"
+
#include <cassert>
+#include <climits>
+#include <cmath>
#include <cstring>
#include <new>
-#include <climits>
+
+#include "webmids.hpp"
#ifdef _MSC_VER
// Disable MSVC warnings that suggest making code non-portable.
#pragma warning(disable : 4996)
#endif
-mkvparser::IMkvReader::~IMkvReader() {}
+namespace mkvparser {
-void mkvparser::GetVersion(int& major, int& minor, int& build, int& revision) {
+IMkvReader::~IMkvReader() {}
+
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size) {
+ if (num_elements == 0 || element_size == 0)
+ return NULL;
+
+ const size_t kMaxAllocSize = 0x80000000; // 2GiB
+ const unsigned long long num_bytes = num_elements * element_size;
+ if (element_size > (kMaxAllocSize / num_elements))
+ return NULL;
+
+ return new (std::nothrow) Type[num_bytes];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
major = 1;
minor = 0;
build = 0;
revision = 30;
}
-long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
- assert(pReader);
- assert(pos >= 0);
-
- int status;
-
- //#ifdef _DEBUG
- // long long total, available;
- // status = pReader->Length(&total, &available);
- // assert(status >= 0);
- // assert((total < 0) || (available <= total));
- // assert(pos < available);
- // assert((available - pos) >= 1); //assume here max u-int len is 8
- //#endif
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
len = 1;
-
unsigned char b;
-
- status = pReader->Read(pos, 1, &b);
+ int status = pReader->Read(pos, 1, &b);
if (status < 0) // error or underflow
return status;
@@ -63,10 +69,6 @@
++len;
}
- //#ifdef _DEBUG
- // assert((available - pos) >= len);
- //#endif
-
long long result = b & (~m);
++pos;
@@ -92,16 +94,76 @@
return result;
}
-long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
- long& len) {
- assert(pReader);
- assert(pos >= 0);
+// Reads an EBML ID and returns it.
+// An ID must at least 1 byte long, cannot exceed 4, and its value must be
+// greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+ if (pReader == NULL || pos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Read the first byte. The length in bytes of the ID is determined by
+ // finding the first set bit in the first byte of the ID.
+ unsigned char temp_byte = 0;
+ int read_status = pReader->Read(pos, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0) // No data to read.
+ return E_BUFFER_NOT_FULL;
+
+ if (temp_byte == 0) // ID length > 8 bytes; invalid file.
+ return E_FILE_FORMAT_INVALID;
+
+ int bit_pos = 0;
+ const int kMaxIdLengthInBytes = 4;
+ const int kCheckByte = 0x80;
+
+ // Find the first bit that's set.
+ bool found_bit = false;
+ for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+ if ((kCheckByte >> bit_pos) & temp_byte) {
+ found_bit = true;
+ break;
+ }
+ }
+
+ if (!found_bit) {
+ // The value is too large to be a valid ID.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Read the remaining bytes of the ID (if any).
+ const int id_length = bit_pos + 1;
+ long long ebml_id = temp_byte;
+ for (int i = 1; i < id_length; ++i) {
+ ebml_id <<= 8;
+ read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0)
+ return E_BUFFER_NOT_FULL;
+
+ ebml_id |= temp_byte;
+ }
+
+ len = id_length;
+ return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
long long total, available;
int status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
+ if (status < 0 || (total >= 0 && available > total))
+ return E_FILE_FORMAT_INVALID;
len = 1;
@@ -112,11 +174,9 @@
status = pReader->Read(pos, 1, &b);
- if (status < 0)
+ if (status != 0)
return status;
- assert(status == 0);
-
if (b == 0) // we can't handle u-int values larger than 8 bytes
return E_FILE_FORMAT_INVALID;
@@ -132,12 +192,8 @@
// TODO(vigneshv): This function assumes that unsigned values never have their
// high bit set.
-long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
- long long size) {
- assert(pReader);
- assert(pos >= 0);
-
- if ((size <= 0) || (size > 8))
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+ if (!pReader || pos < 0 || (size <= 0) || (size > 8))
return E_FILE_FORMAT_INVALID;
long long result = 0;
@@ -159,12 +215,9 @@
return result;
}
-long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
- long long size_, double& result) {
- assert(pReader);
- assert(pos >= 0);
-
- if ((size_ != 4) && (size_ != 8))
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+ double& result) {
+ if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
return E_FILE_FORMAT_INVALID;
const long size = static_cast<long>(size_);
@@ -195,8 +248,6 @@
result = f;
} else {
- assert(size == 8);
-
union {
double d;
unsigned long long dd;
@@ -216,28 +267,25 @@
result = d;
}
+ if (std::isinf(result) || std::isnan(result))
+ return E_FILE_FORMAT_INVALID;
+
return 0;
}
-long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos,
- long long size, long long& result) {
- assert(pReader);
- assert(pos >= 0);
- assert(size > 0);
- assert(size <= 8);
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+ long long& result_ref) {
+ if (!pReader || pos < 0 || size < 1 || size > 8)
+ return E_FILE_FORMAT_INVALID;
- {
- signed char b;
+ signed char first_byte = 0;
+ const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
- const long status = pReader->Read(pos, 1, (unsigned char*)&b);
+ if (status < 0)
+ return status;
- if (status < 0)
- return status;
-
- result = b;
-
- ++pos;
- }
+ unsigned long long result = first_byte;
+ ++pos;
for (long i = 1; i < size; ++i) {
unsigned char b;
@@ -253,23 +301,24 @@
++pos;
}
- return 0; // success
+ result_ref = static_cast<long long>(result);
+ return 0;
}
-long mkvparser::UnserializeString(IMkvReader* pReader, long long pos,
- long long size_, char*& str) {
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+ char*& str) {
delete[] str;
str = NULL;
- if (size_ >= LONG_MAX) // we need (size+1) chars
+ if (size >= LONG_MAX || size < 0)
return E_FILE_FORMAT_INVALID;
- const long size = static_cast<long>(size_);
+ // +1 for '\0' terminator
+ const long required_size = static_cast<long>(size) + 1;
- str = new (std::nothrow) char[size + 1];
-
+ str = SafeArrayAlloc<char>(1, required_size);
if (str == NULL)
- return -1;
+ return E_FILE_FORMAT_INVALID;
unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
@@ -282,137 +331,149 @@
return status;
}
- str[size] = '\0';
-
- return 0; // success
+ str[required_size - 1] = '\0';
+ return 0;
}
-long mkvparser::ParseElementHeader(IMkvReader* pReader, long long& pos,
- long long stop, long long& id,
- long long& size) {
- if ((stop >= 0) && (pos >= stop))
+long ParseElementHeader(IMkvReader* pReader, long long& pos,
+ long long stop, long long& id,
+ long long& size) {
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
long len;
- id = ReadUInt(pReader, pos, len);
+ id = ReadID(pReader, pos, len);
if (id < 0)
return E_FILE_FORMAT_INVALID;
pos += len; // consume id
- if ((stop >= 0) && (pos >= stop))
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
size = ReadUInt(pReader, pos, len);
- if (size < 0)
+ if (size < 0 || len < 1 || len > 8) {
+ // Invalid: Negative payload size, negative or 0 length integer, or integer
+ // larger than 64 bits (libwebm cannot handle them).
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ const unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LONG_LONG_MAX)
return E_FILE_FORMAT_INVALID;
pos += len; // consume length of size
// pos now designates payload
- if ((stop >= 0) && ((pos + size) > stop))
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
return 0; // success
}
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
- long long& val) {
- assert(pReader);
- assert(pos >= 0);
-
- long long total, available;
-
- const long status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
- if (status < 0)
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ long long& val) {
+ if (!pReader || pos < 0)
return false;
- long len;
+ long long total = 0;
+ long long available = 0;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ const long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
- if ((unsigned long)id != id_)
+ long len = 0;
+
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
return false;
pos += len; // consume id
const long long size = ReadUInt(pReader, pos, len);
- assert(size >= 0);
- assert(size <= 8);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+ return false;
pos += len; // consume length of size of payload
val = UnserializeUInt(pReader, pos, size);
- assert(val >= 0);
+ if (val < 0)
+ return false;
pos += size; // consume size of payload
return true;
}
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
- unsigned char*& buf, size_t& buflen) {
- assert(pReader);
- assert(pos >= 0);
-
- long long total, available;
-
- long status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
- if (status < 0)
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ unsigned char*& buf, size_t& buflen) {
+ if (!pReader || pos < 0)
return false;
- long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ long long total = 0;
+ long long available = 0;
- if ((unsigned long)id != id_)
+ long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
+
+ long len = 0;
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
return false;
pos += len; // consume id
- const long long size_ = ReadUInt(pReader, pos, len);
- assert(size_ >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ const long long size = ReadUInt(pReader, pos, len);
+ if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+ return false;
+
+ unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LONG_LONG_MAX)
+ return false;
pos += len; // consume length of size of payload
- assert((pos + size_) <= available);
- const long buflen_ = static_cast<long>(size_);
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LONG_LONG_MAX)
+ return false;
- buf = new (std::nothrow) unsigned char[buflen_];
- assert(buf); // TODO
+ if ((pos + size) > available)
+ return false;
+
+ if (size >= LONG_MAX)
+ return false;
+
+ const long buflen_ = static_cast<long>(size);
+
+ buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+ if (!buf)
+ return false;
status = pReader->Read(pos, buflen_, buf);
- assert(status == 0); // TODO
+ if (status != 0)
+ return false;
buflen = buflen_;
- pos += size_; // consume size of payload
+ pos += size; // consume size of payload
return true;
}
-namespace mkvparser {
-
EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
EBMLHeader::~EBMLHeader() { delete[] m_docType; }
@@ -433,7 +494,8 @@
}
long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
- assert(pReader);
+ if (!pReader)
+ return E_FILE_FORMAT_INVALID;
long long total, available;
@@ -445,67 +507,45 @@
pos = 0;
long long end = (available >= 1024) ? 1024 : available;
- for (;;) {
- unsigned char b = 0;
+ // Scan until we find what looks like the first byte of the EBML header.
+ const int kMaxScanBytes = (available >= 1024) ? 1024 : available;
+ const unsigned char kEbmlByte0 = 0x1A;
+ unsigned char scan_byte = 0;
- while (pos < end) {
- status = pReader->Read(pos, 1, &b);
+ while (pos < kMaxScanBytes) {
+ status = pReader->Read(pos, 1, &scan_byte);
- if (status < 0) // error
- return status;
+ if (status < 0) // error
+ return status;
+ else if (status > 0)
+ return E_BUFFER_NOT_FULL;
- if (b == 0x1A)
- break;
-
- ++pos;
- }
-
- if (b != 0x1A) {
- if (pos >= 1024)
- return E_FILE_FORMAT_INVALID; // don't bother looking anymore
-
- if ((total >= 0) && ((total - available) < 5))
- return E_FILE_FORMAT_INVALID;
-
- return available + 5; // 5 = 4-byte ID + 1st byte of size
- }
-
- if ((total >= 0) && ((total - pos) < 5))
- return E_FILE_FORMAT_INVALID;
-
- if ((available - pos) < 5)
- return pos + 5; // try again later
-
- long len;
-
- const long long result = ReadUInt(pReader, pos, len);
-
- if (result < 0) // error
- return result;
-
- if (result == 0x0A45DFA3) { // EBML Header ID
- pos += len; // consume ID
+ if (scan_byte == kEbmlByte0)
break;
- }
- ++pos; // throw away just the 0x1A byte, and try again
+ ++pos;
}
- // pos designates start of size field
+ long len = 0;
+ const long long ebml_id = ReadID(pReader, pos, len);
- // get length of size field
+ // TODO(tomfinegan): Move Matroska ID constants into a common namespace.
+ if (len != 4 || ebml_id != mkvmuxer::kMkvEBML)
+ return E_FILE_FORMAT_INVALID;
- long len;
+ // Move read pos forward to the EBML header size field.
+ pos += 4;
+
+ // Read length of size field.
long long result = GetUIntLength(pReader, pos, len);
if (result < 0) // error
- return result;
+ return E_FILE_FORMAT_INVALID;
+ else if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
- if (result > 0) // need more data
- return result;
-
- assert(len > 0);
- assert(len <= 8);
+ if (len < 1 || len > 8)
+ return E_FILE_FORMAT_INVALID;
if ((total >= 0) && ((total - pos) < len))
return E_FILE_FORMAT_INVALID;
@@ -513,8 +553,7 @@
if ((available - pos) < len)
return pos + len; // try again later
- // get the EBML header size
-
+ // Read the EBML header size.
result = ReadUInt(pReader, pos, len);
if (result < 0) // error
@@ -542,30 +581,30 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0)
return E_FILE_FORMAT_INVALID;
- if (id == 0x0286) { // version
+ if (id == mkvmuxer::kMkvEBMLVersion) {
m_version = UnserializeUInt(pReader, pos, size);
if (m_version <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F7) { // read version
+ } else if (id == mkvmuxer::kMkvEBMLReadVersion) {
m_readVersion = UnserializeUInt(pReader, pos, size);
if (m_readVersion <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F2) { // max id length
+ } else if (id == mkvmuxer::kMkvEBMLMaxIDLength) {
m_maxIdLength = UnserializeUInt(pReader, pos, size);
if (m_maxIdLength <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F3) { // max size length
+ } else if (id == mkvmuxer::kMkvEBMLMaxSizeLength) {
m_maxSizeLength = UnserializeUInt(pReader, pos, size);
if (m_maxSizeLength <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0282) { // doctype
+ } else if (id == mkvmuxer::kMkvDocType) {
if (m_docType)
return E_FILE_FORMAT_INVALID;
@@ -573,12 +612,12 @@
if (status) // error
return status;
- } else if (id == 0x0287) { // doctype version
+ } else if (id == mkvmuxer::kMkvDocTypeVersion) {
m_docTypeVersion = UnserializeUInt(pReader, pos, size);
if (m_docTypeVersion <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0285) { // doctype read version
+ } else if (id == mkvmuxer::kMkvDocTypeReadVersion) {
m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
if (m_docTypeReadVersion <= 0)
@@ -588,7 +627,18 @@
pos += size;
}
- assert(pos == end);
+ if (pos != end)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+ if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+ if (m_maxIdLength <= 0 || m_maxIdLength > 4 ||
+ m_maxSizeLength <= 0 || m_maxSizeLength > 8)
+ return E_FILE_FORMAT_INVALID;
+
return 0;
}
@@ -621,8 +671,6 @@
while (i != j) {
Cluster* const p = *i++;
- assert(p);
-
delete p;
}
@@ -638,8 +686,8 @@
long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
Segment*& pSegment) {
- assert(pReader);
- assert(pos >= 0);
+ if (pReader == NULL || pos < 0)
+ return E_PARSE_FAILED;
pSegment = NULL;
@@ -691,10 +739,10 @@
return pos + len;
const long long idpos = pos;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
- if (id < 0) // error
- return id;
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -723,7 +771,7 @@
// Handle "unknown size" for live streaming of webm files.
const long long unknown_size = (1LL << (7 * len)) - 1;
- if (id == 0x08538067) { // Segment ID
+ if (id == mkvmuxer::kMkvSegment) {
if (size == unknown_size)
size = -1;
@@ -733,12 +781,9 @@
else if ((pos + size) > total)
size = -1;
- pSegment = new (std::nothrow) Segment(pReader, idpos,
- // elem_size
- pos, size);
-
- if (pSegment == 0)
- return -1; // generic error
+ pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+ if (pSegment == NULL)
+ return E_PARSE_FAILED;
return 0; // success
}
@@ -767,11 +812,15 @@
if (status < 0) // error
return status;
- assert((total < 0) || (available <= total));
+ if (total > 0 && available > total)
+ return E_FILE_FORMAT_INVALID;
const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
- assert((segment_stop < 0) || (total < 0) || (segment_stop <= total));
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+ if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+ (segment_stop >= 0 && m_pos > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
for (;;) {
if ((total >= 0) && (m_pos >= total))
@@ -783,6 +832,11 @@
long long pos = m_pos;
const long long element_start = pos;
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ unsigned long long rollover_check = pos + 1ULL;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
if ((pos + 1) > available)
return (pos + 1);
@@ -792,8 +846,10 @@
if (result < 0) // error
return result;
- if (result > 0) // underflow (weird)
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
return (pos + 1);
+ }
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
return E_FILE_FORMAT_INVALID;
@@ -802,12 +858,12 @@
return pos + len;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id < 0) // error
- return id;
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
break;
pos += len; // consume ID
@@ -821,8 +877,10 @@
if (result < 0) // error
return result;
- if (result > 0) // underflow (weird)
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
return (pos + 1);
+ }
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
return E_FILE_FORMAT_INVALID;
@@ -832,11 +890,19 @@
const long long size = ReadUInt(m_pReader, pos, len);
- if (size < 0) // error
+ if (size < 0 || len < 1 || len > 8) {
+ // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+ // len > 8 is true instead of checking this _everywhere_.
return size;
+ }
pos += len; // consume length of size of element
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
const long long element_size = size + pos - element_start;
// Pos now points to start of payload
@@ -849,7 +915,7 @@
if ((pos + size) > available)
return pos + size;
- if (id == 0x0549A966) { // Segment Info ID
+ if (id == mkvmuxer::kMkvInfo) {
if (m_pInfo)
return E_FILE_FORMAT_INVALID;
@@ -863,7 +929,7 @@
if (status)
return status;
- } else if (id == 0x0654AE6B) { // Tracks ID
+ } else if (id == mkvmuxer::kMkvTracks) {
if (m_pTracks)
return E_FILE_FORMAT_INVALID;
@@ -877,7 +943,7 @@
if (status)
return status;
- } else if (id == 0x0C53BB6B) { // Cues ID
+ } else if (id == mkvmuxer::kMkvCues) {
if (m_pCues == NULL) {
m_pCues = new (std::nothrow)
Cues(this, pos, size, element_start, element_size);
@@ -885,7 +951,7 @@
if (m_pCues == NULL)
return -1;
}
- } else if (id == 0x014D9B74) { // SeekHead ID
+ } else if (id == mkvmuxer::kMkvSeekHead) {
if (m_pSeekHead == NULL) {
m_pSeekHead = new (std::nothrow)
SeekHead(this, pos, size, element_start, element_size);
@@ -898,7 +964,7 @@
if (status)
return status;
}
- } else if (id == 0x0043A770) { // Chapters ID
+ } else if (id == mkvmuxer::kMkvChapters) {
if (m_pChapters == NULL) {
m_pChapters = new (std::nothrow)
Chapters(this, pos, size, element_start, element_size);
@@ -911,7 +977,7 @@
if (status)
return status;
}
- } else if (id == 0x0254C367) { // Tags ID
+ } else if (id == mkvmuxer::kMkvTags) {
if (m_pTags == NULL) {
m_pTags = new (std::nothrow)
Tags(this, pos, size, element_start, element_size);
@@ -929,7 +995,8 @@
m_pos = pos + size; // consume payload
}
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
if (m_pInfo == NULL) // TODO: liberalize this behavior
return E_FILE_FORMAT_INVALID;
@@ -960,7 +1027,8 @@
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
@@ -988,7 +1056,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -998,10 +1066,10 @@
return E_BUFFER_NOT_FULL;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id < 0) // error (or underflow)
- return static_cast<long>(id);
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -1017,7 +1085,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -1035,7 +1103,8 @@
// pos now points to start of payload
- if (size == 0) { // weird
+ if (size == 0) {
+ // Missing element payload: move on.
m_pos = pos;
continue;
}
@@ -1047,24 +1116,30 @@
return E_FILE_FORMAT_INVALID;
}
- if (id == 0x0C53BB6B) { // Cues ID
- if (size == unknown_size)
- return E_FILE_FORMAT_INVALID; // TODO: liberalize
+ if (id == mkvmuxer::kMkvCues) {
+ if (size == unknown_size) {
+ // Cues element of unknown size: Not supported.
+ return E_FILE_FORMAT_INVALID;
+ }
if (m_pCues == NULL) {
const long long element_size = (pos - idpos) + size;
- m_pCues = new Cues(this, pos, size, idpos, element_size);
- assert(m_pCues); // TODO
+ m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+ if (m_pCues == NULL)
+ return -1;
}
m_pos = pos + size; // consume payload
continue;
}
- if (id != 0x0F43B675) { // Cluster ID
+ if (id != mkvmuxer::kMkvCluster) {
+ // Besides the Segment, Libwebm allows only cluster elements of unknown
+ // size. Fail the parse upon encountering a non-cluster element reporting
+ // unknown size.
if (size == unknown_size)
- return E_FILE_FORMAT_INVALID; // TODO: liberalize
+ return E_FILE_FORMAT_INVALID;
m_pos = pos + size; // consume payload
continue;
@@ -1080,7 +1155,10 @@
break;
}
- assert(cluster_off >= 0); // have cluster
+ if (cluster_off < 0) {
+ // No cluster, die.
+ return E_FILE_FORMAT_INVALID;
+ }
long long pos_;
long len_;
@@ -1126,14 +1204,16 @@
const long idx = m_clusterCount;
if (m_clusterPreloadCount > 0) {
- assert(idx < m_clusterSize);
+ if (idx >= m_clusterSize)
+ return E_FILE_FORMAT_INVALID;
Cluster* const pCluster = m_clusters[idx];
- assert(pCluster);
- assert(pCluster->m_index < 0);
+ if (pCluster == NULL || pCluster->m_index >= 0)
+ return E_FILE_FORMAT_INVALID;
const long long off = pCluster->GetPosition();
- assert(off >= 0);
+ if (off < 0)
+ return E_FILE_FORMAT_INVALID;
if (off == cluster_off) { // preloaded already
if (status == 0) // no entries found
@@ -1155,7 +1235,8 @@
--m_clusterPreloadCount;
m_pos = pos; // consume payload
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 0; // success
}
@@ -1182,19 +1263,21 @@
// status > 0 means we have an entry
Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
- // element_size);
- assert(pCluster);
+ if (pCluster == NULL)
+ return -1;
- AppendCluster(pCluster);
- assert(m_clusters);
- assert(idx < m_clusterSize);
- assert(m_clusters[idx] == pCluster);
+ if (!AppendCluster(pCluster)) {
+ delete pCluster;
+ return -1;
+ }
if (cluster_size >= 0) {
pos += cluster_size;
m_pos = pos;
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+ if (segment_stop > 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -1210,8 +1293,8 @@
}
long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
- assert(m_pos < 0);
- assert(m_pUnknownSize);
+ if (m_pos >= 0 || m_pUnknownSize == NULL)
+ return E_PARSE_FAILED;
const long status = m_pUnknownSize->Parse(pos, len);
@@ -1221,12 +1304,11 @@
if (status == 0) // parsed a block
return 2; // continue parsing
- assert(status > 0); // nothing left to parse of this cluster
-
const long long start = m_pUnknownSize->m_element_start;
-
const long long size = m_pUnknownSize->GetElementSize();
- assert(size >= 0);
+
+ if (size < 0)
+ return E_FILE_FORMAT_INVALID;
pos = start + size;
m_pos = pos;
@@ -1236,24 +1318,26 @@
return 2; // continue parsing
}
-void Segment::AppendCluster(Cluster* pCluster) {
- assert(pCluster);
- assert(pCluster->m_index >= 0);
+bool Segment::AppendCluster(Cluster* pCluster) {
+ if (pCluster == NULL || pCluster->m_index < 0)
+ return false;
const long count = m_clusterCount + m_clusterPreloadCount;
long& size = m_clusterSize;
- assert(size >= count);
-
const long idx = pCluster->m_index;
- assert(idx == m_clusterCount);
+
+ if (size < count || idx != m_clusterCount)
+ return false;
if (count >= size) {
const long n = (size <= 0) ? 2048 : 2 * size;
- Cluster** const qq = new Cluster*[n];
- Cluster** q = qq;
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
+ Cluster** q = qq;
Cluster** p = m_clusters;
Cluster** const pp = p + count;
@@ -1267,18 +1351,18 @@
}
if (m_clusterPreloadCount > 0) {
- assert(m_clusters);
-
Cluster** const p = m_clusters + m_clusterCount;
- assert(*p);
- assert((*p)->m_index < 0);
+ if (*p == NULL || (*p)->m_index >= 0)
+ return false;
Cluster** q = p + m_clusterPreloadCount;
- assert(q < (m_clusters + size));
+ if (q >= (m_clusters + size))
+ return false;
for (;;) {
Cluster** const qq = q - 1;
- assert((*qq)->m_index < 0);
+ if ((*qq)->m_index >= 0)
+ return false;
*q = *qq;
q = qq;
@@ -1290,22 +1374,25 @@
m_clusters[idx] = pCluster;
++m_clusterCount;
+ return true;
}
-void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
- assert(pCluster);
- assert(pCluster->m_index < 0);
- assert(idx >= m_clusterCount);
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+ if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+ return false;
const long count = m_clusterCount + m_clusterPreloadCount;
long& size = m_clusterSize;
- assert(size >= count);
+ if (size < count)
+ return false;
if (count >= size) {
const long n = (size <= 0) ? 2048 : 2 * size;
- Cluster** const qq = new Cluster*[n];
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
Cluster** q = qq;
Cluster** p = m_clusters;
@@ -1320,17 +1407,20 @@
size = n;
}
- assert(m_clusters);
+ if (m_clusters == NULL)
+ return false;
Cluster** const p = m_clusters + idx;
Cluster** q = m_clusters + count;
- assert(q >= p);
- assert(q < (m_clusters + size));
+ if (q < p || q >= (m_clusters + size))
+ return false;
while (q > p) {
Cluster** const qq = q - 1;
- assert((*qq)->m_index < 0);
+
+ if ((*qq)->m_index >= 0)
+ return false;
*q = *qq;
q = qq;
@@ -1338,13 +1428,12 @@
m_clusters[idx] = pCluster;
++m_clusterPreloadCount;
+ return true;
}
long Segment::Load() {
- assert(m_clusters == NULL);
- assert(m_clusterSize == 0);
- assert(m_clusterCount == 0);
- // assert(m_size >= 0);
+ if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0)
+ return E_PARSE_FAILED;
// Outermost (level 0) segment object has been constructed,
// and pos designates start of payload. We need to find the
@@ -1358,8 +1447,8 @@
if (header_status > 0) // underflow
return E_BUFFER_NOT_FULL;
- assert(m_pInfo);
- assert(m_pTracks);
+ if (m_pInfo == NULL || m_pTracks == NULL)
+ return E_FILE_FORMAT_INVALID;
for (;;) {
const int status = LoadCluster();
@@ -1408,16 +1497,19 @@
if (status < 0) // error
return status;
- if (id == 0x0DBB) // SeekEntry ID
+ if (id == mkvmuxer::kMkvSeek)
++entry_count;
- else if (id == 0x6C) // Void ID
+ else if (id == mkvmuxer::kMkvVoid)
++void_element_count;
pos += size; // consume payload
- assert(pos <= stop);
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
m_entries = new (std::nothrow) Entry[entry_count];
@@ -1446,14 +1538,14 @@
if (status < 0) // error
return status;
- if (id == 0x0DBB) { // SeekEntry ID
+ if (id == mkvmuxer::kMkvSeek) {
if (ParseEntry(pReader, pos, size, pEntry)) {
Entry& e = *pEntry++;
e.element_start = idpos;
e.element_size = (pos + size) - idpos;
}
- } else if (id == 0x6C) { // Void ID
+ } else if (id == mkvmuxer::kMkvVoid) {
VoidElement& e = *pVoidElement++;
e.element_start = idpos;
@@ -1461,10 +1553,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
assert(count_ >= 0);
@@ -1553,9 +1647,9 @@
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id != 0x0C53BB6B) // Cues ID
+ if (id != mkvmuxer::kMkvCues)
return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -1615,7 +1709,8 @@
m_pCues =
new (std::nothrow) Cues(this, pos, size, element_start, element_size);
- assert(m_pCues); // TODO
+ if (m_pCues == NULL)
+ return -1;
return 0; // success
}
@@ -1632,10 +1727,11 @@
// parse the container for the level-1 element ID
- const long long seekIdId = ReadUInt(pReader, pos, len);
- // seekIdId;
+ const long long seekIdId = ReadID(pReader, pos, len);
+ if (seekIdId < 0)
+ return false;
- if (seekIdId != 0x13AB) // SeekID ID
+ if (seekIdId != mkvmuxer::kMkvSeekID)
return false;
if ((pos + len) > stop)
@@ -1677,9 +1773,9 @@
pos += seekIdSize; // consume SeekID payload
- const long long seekPosId = ReadUInt(pReader, pos, len);
+ const long long seekPosId = ReadID(pReader, pos, len);
- if (seekPosId != 0x13AC) // SeekPos ID
+ if (seekPosId != mkvmuxer::kMkvSeekPosition)
return false;
if ((pos + len) > stop)
@@ -1757,8 +1853,8 @@
if (m_cue_points)
return true;
- assert(m_count == 0);
- assert(m_preload_count == 0);
+ if (m_count != 0 || m_preload_count != 0)
+ return false;
IMkvReader* const pReader = m_pSegment->m_pReader;
@@ -1772,7 +1868,7 @@
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0 || (pos + len) > stop) {
return false;
}
@@ -1789,21 +1885,27 @@
return false;
}
- if (id == 0x3B) // CuePoint ID
- PreloadCuePoint(cue_points_size, idpos);
+ if (id == mkvmuxer::kMkvCuePoint) {
+ if (!PreloadCuePoint(cue_points_size, idpos))
+ return false;
+ }
pos += size; // skip payload
}
return true;
}
-void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
- assert(m_count == 0);
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+ if (m_count != 0)
+ return false;
if (m_preload_count >= cue_points_size) {
const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
- CuePoint** const qq = new CuePoint*[n];
+ CuePoint** const qq = new (std::nothrow) CuePoint*[n];
+ if (qq == NULL)
+ return false;
+
CuePoint** q = qq; // beginning of target
CuePoint** p = m_cue_points; // beginning of source
@@ -1818,14 +1920,15 @@
cue_points_size = n;
}
- CuePoint* const pCP = new CuePoint(m_preload_count, pos);
+ CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos);
+ if (pCP == NULL)
+ return false;
+
m_cue_points[m_preload_count++] = pCP;
+ return true;
}
bool Cues::LoadCuePoint() const {
- // odbgstream os;
- // os << "Cues::LoadCuePoint" << endl;
-
const long long stop = m_start + m_size;
if (m_pos >= stop)
@@ -1843,32 +1946,33 @@
long len;
- const long long id = ReadUInt(pReader, m_pos, len);
- assert(id >= 0); // TODO
- assert((m_pos + len) <= stop);
+ const long long id = ReadID(pReader, m_pos, len);
+ if (id < 0 || (m_pos + len) > stop)
+ return false;
m_pos += len; // consume ID
const long long size = ReadUInt(pReader, m_pos, len);
- assert(size >= 0);
- assert((m_pos + len) <= stop);
+ if (size < 0 || (m_pos + len) > stop)
+ return false;
m_pos += len; // consume Size field
- assert((m_pos + size) <= stop);
+ if ((m_pos + size) > stop)
+ return false;
- if (id != 0x3B) { // CuePoint ID
+ if (id != mkvmuxer::kMkvCuePoint) {
m_pos += size; // consume payload
- assert(m_pos <= stop);
+ if (m_pos > stop)
+ return false;
continue;
}
- assert(m_preload_count > 0);
+ if (m_preload_count < 1)
+ return false;
CuePoint* const pCP = m_cue_points[m_count];
- assert(pCP);
- assert((pCP->GetTimeCode() >= 0) || (-pCP->GetTimeCode() == idpos));
- if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
+ if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos)))
return false;
if (!pCP->Load(pReader)) {
@@ -1879,24 +1983,18 @@
--m_preload_count;
m_pos += size; // consume payload
- assert(m_pos <= stop);
+ if (m_pos > stop)
+ return false;
return true; // yes, we loaded a cue point
}
- // return (m_pos < stop);
return false; // no, we did not load a cue point
}
bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
const CuePoint::TrackPosition*& pTP) const {
- assert(time_ns >= 0);
- assert(pTrack);
-
- if (m_cue_points == NULL)
- return false;
-
- if (m_count == 0)
+ if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
return false;
CuePoint** const ii = m_cue_points;
@@ -1906,7 +2004,8 @@
CuePoint** j = jj;
pCP = *i;
- assert(pCP);
+ if (pCP == NULL)
+ return false;
if (time_ns <= pCP->GetTime(m_pSegment)) {
pTP = pCP->Find(pTrack);
@@ -1920,10 +2019,12 @@
//[j, jj) > time_ns
CuePoint** const k = i + (j - i) / 2;
- assert(k < jj);
+ if (k >= jj)
+ return false;
CuePoint* const pCP = *k;
- assert(pCP);
+ if (pCP == NULL)
+ return false;
const long long t = pCP->GetTime(m_pSegment);
@@ -1932,16 +2033,17 @@
else
j = k;
- assert(i <= j);
+ if (i > j)
+ return false;
}
- assert(i == j);
- assert(i <= jj);
- assert(i > ii);
+ if (i != j || i > jj || i <= ii)
+ return false;
pCP = *--i;
- assert(pCP);
- assert(pCP->GetTime(m_pSegment) <= time_ns);
+
+ if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+ return false;
// TODO: here and elsewhere, it's probably not correct to search
// for the cue point with this time, and then search for a matching
@@ -1956,55 +2058,50 @@
}
const CuePoint* Cues::GetFirst() const {
- if (m_cue_points == NULL)
- return NULL;
-
- if (m_count == 0)
+ if (m_cue_points == NULL || m_count == 0)
return NULL;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
+ if (pp == NULL)
+ return NULL;
CuePoint* const pCP = pp[0];
- assert(pCP);
- assert(pCP->GetTimeCode() >= 0);
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
return pCP;
}
const CuePoint* Cues::GetLast() const {
- if (m_cue_points == NULL)
- return NULL;
-
- if (m_count <= 0)
+ if (m_cue_points == NULL || m_count <= 0)
return NULL;
const long index = m_count - 1;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
+ if (pp == NULL)
+ return NULL;
CuePoint* const pCP = pp[index];
- assert(pCP);
- assert(pCP->GetTimeCode() >= 0);
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
return pCP;
}
const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
- if (pCurr == NULL)
+ if (pCurr == NULL || pCurr->GetTimeCode() < 0 ||
+ m_cue_points == NULL || m_count < 1) {
return NULL;
-
- assert(pCurr->GetTimeCode() >= 0);
- assert(m_cue_points);
- assert(m_count >= 1);
+ }
long index = pCurr->m_index;
- assert(index < m_count);
+ if (index >= m_count)
+ return NULL;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
- assert(pp[index] == pCurr);
+ if (pp == NULL || pp[index] != pCurr)
+ return NULL;
++index;
@@ -2012,18 +2109,16 @@
return NULL;
CuePoint* const pNext = pp[index];
- assert(pNext);
- assert(pNext->GetTimeCode() >= 0);
+
+ if (pNext == NULL || pNext->GetTimeCode() < 0)
+ return NULL;
return pNext;
}
const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
const CuePoint::TrackPosition* pTP) const {
- if (pCP == NULL)
- return NULL;
-
- if (pTP == NULL)
+ if (pCP == NULL || pTP == NULL)
return NULL;
return m_pSegment->GetBlock(*pCP, *pTP);
@@ -2070,11 +2165,15 @@
// assert(Cluster::HasBlockEntries(this, tp.m_pos));
Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1);
- assert(pCluster);
+ if (pCluster == NULL)
+ return NULL;
const ptrdiff_t idx = i - m_clusters;
- PreloadCluster(pCluster, idx);
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
assert(m_clusters);
assert(m_clusterPreloadCount > 0);
assert(m_clusters[idx] == pCluster);
@@ -2125,12 +2224,15 @@
// assert(Cluster::HasBlockEntries(this, tp.m_pos));
Cluster* const pCluster = Cluster::Create(this, -1, requested_pos);
- //-1);
- assert(pCluster);
+ if (pCluster == NULL)
+ return NULL;
const ptrdiff_t idx = i - m_clusters;
- PreloadCluster(pCluster, idx);
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
assert(m_clusters);
assert(m_clusterPreloadCount > 0);
assert(m_clusters[idx] == pCluster);
@@ -2168,9 +2270,8 @@
{
long len;
- const long long id = ReadUInt(pReader, pos_, len);
- assert(id == 0x3B); // CuePoint ID
- if (id != 0x3B)
+ const long long id = ReadID(pReader, pos_, len);
+ if (id != mkvmuxer::kMkvCuePoint)
return false;
pos_ += len; // consume ID
@@ -2193,7 +2294,7 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if ((id < 0) || (pos + len > stop)) {
return false;
}
@@ -2210,10 +2311,10 @@
return false;
}
- if (id == 0x33) // CueTime ID
+ if (id == mkvmuxer::kMkvCueTime)
m_timecode = UnserializeUInt(pReader, pos, size);
- else if (id == 0x37) // CueTrackPosition(s) ID
+ else if (id == mkvmuxer::kMkvCueTrackPositions)
++m_track_positions_count;
pos += size; // consume payload
@@ -2227,7 +2328,9 @@
// << " timecode=" << m_timecode
// << endl;
- m_track_positions = new TrackPosition[m_track_positions_count];
+ m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count];
+ if (m_track_positions == NULL)
+ return false;
// Now parse track positions
@@ -2237,9 +2340,9 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert((pos + len) <= stop);
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return false;
pos += len; // consume ID
@@ -2250,7 +2353,7 @@
pos += len; // consume Size field
assert((pos + size) <= stop);
- if (id == 0x37) { // CueTrackPosition(s) ID
+ if (id == mkvmuxer::kMkvCueTrackPositions) {
TrackPosition& tp = *p++;
if (!tp.Parse(pReader, pos, size)) {
return false;
@@ -2258,7 +2361,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return false;
}
assert(size_t(p - m_track_positions) == m_track_positions_count);
@@ -2281,7 +2385,7 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if ((id < 0) || ((pos + len) > stop)) {
return false;
}
@@ -2298,13 +2402,11 @@
return false;
}
- if (id == 0x77) // CueTrack ID
+ if (id == mkvmuxer::kMkvCueTrack)
m_track = UnserializeUInt(pReader, pos, size);
-
- else if (id == 0x71) // CueClusterPos ID
+ else if (id == mkvmuxer::kMkvCueClusterPosition)
m_pos = UnserializeUInt(pReader, pos, size);
-
- else if (id == 0x1378) // CueBlockNumber
+ else if (id == mkvmuxer::kMkvCueBlockNumber)
m_block = UnserializeUInt(pReader, pos, size);
pos += size; // consume payload
@@ -2437,9 +2539,8 @@
if (result != 0)
return NULL;
- const long long id = ReadUInt(m_pReader, pos, len);
- assert(id == 0x0F43B675); // Cluster ID
- if (id != 0x0F43B675)
+ const long long id = ReadID(m_pReader, pos, len);
+ if (id != mkvmuxer::kMkvCluster)
return NULL;
pos += len; // consume ID
@@ -2474,8 +2575,9 @@
const long long idpos = pos; // pos of next (potential) cluster
- const long long id = ReadUInt(m_pReader, idpos, len);
- assert(id > 0); // TODO
+ const long long id = ReadID(m_pReader, idpos, len);
+ if (id < 0)
+ return NULL;
pos += len; // consume ID
@@ -2495,7 +2597,7 @@
if (size == 0) // weird
continue;
- if (id == 0x0F43B675) { // Cluster ID
+ if (id == mkvmuxer::kMkvCluster) {
const long long off_next_ = idpos - m_start;
long long pos_;
@@ -2553,11 +2655,15 @@
assert(i == j);
Cluster* const pNext = Cluster::Create(this, -1, off_next);
- assert(pNext);
+ if (pNext == NULL)
+ return NULL;
const ptrdiff_t idx_next = i - m_clusters; // insertion position
- PreloadCluster(pNext, idx_next);
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return NULL;
+ }
assert(m_clusters);
assert(idx_next < m_clusterSize);
assert(m_clusters[idx_next] == pNext);
@@ -2641,7 +2747,7 @@
const long long id = ReadUInt(m_pReader, pos, len);
- if (id != 0x0F43B675) // weird: not Cluster ID
+ if (id != mkvmuxer::kMkvCluster)
return -1;
pos += len; // consume ID
@@ -2687,7 +2793,8 @@
// Pos now points to start of payload
pos += size; // consume payload (that is, the current cluster)
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
// By consuming the payload, we are assuming that the curr
// cluster isn't interesting. That is, we don't bother checking
@@ -2755,7 +2862,7 @@
const long long idpos = pos; // absolute
const long long idoff = pos - m_start; // relative
- const long long id = ReadUInt(m_pReader, idpos, len); // absolute
+ const long long id = ReadID(m_pReader, idpos, len); // absolute
if (id < 0) // error
return static_cast<long>(id);
@@ -2805,7 +2912,7 @@
return E_FILE_FORMAT_INVALID;
}
- if (id == 0x0C53BB6B) { // Cues ID
+ if (id == mkvmuxer::kMkvCues) {
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
@@ -2818,22 +2925,26 @@
const long long element_size = element_stop - element_start;
if (m_pCues == NULL) {
- m_pCues = new Cues(this, pos, size, element_start, element_size);
- assert(m_pCues); // TODO
+ m_pCues = new (std::nothrow)
+ Cues(this, pos, size, element_start, element_size);
+ if (m_pCues == NULL)
+ return false;
}
pos += size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
continue;
}
- if (id != 0x0F43B675) { // not a Cluster ID
+ if (id != mkvmuxer::kMkvCluster) { // not a Cluster ID
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
pos += size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
continue;
}
@@ -2905,12 +3016,15 @@
Cluster* const pNext = Cluster::Create(this,
-1, // preloaded
off_next);
- // element_size);
- assert(pNext);
+ if (pNext == NULL)
+ return -1;
const ptrdiff_t idx_next = i - m_clusters; // insertion position
- PreloadCluster(pNext, idx_next);
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return -1;
+ }
assert(m_clusters);
assert(idx_next < m_clusterSize);
assert(m_clusters[idx_next] == pNext);
@@ -2953,7 +3067,7 @@
return E_BUFFER_NOT_FULL;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
if (id < 0) // error (or underflow)
return static_cast<long>(id);
@@ -2962,10 +3076,7 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
- break;
-
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCluster || id == mkvmuxer::kMkvCues)
break;
pos += len; // consume ID (of sub-element)
@@ -3012,7 +3123,8 @@
return E_FILE_FORMAT_INVALID;
pos += size; // consume payload of sub-element
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
} // determine cluster size
cluster_size = pos - payload_pos;
@@ -3022,7 +3134,8 @@
}
pos += cluster_size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 2; // try to find a cluster that follows next
}
@@ -3131,7 +3244,7 @@
if (size == 0) // weird
continue;
- if (id == 0x05B9) { // EditionEntry ID
+ if (id == mkvmuxer::kMkvEditionEntry) {
status = ParseEdition(pos, size);
if (status < 0) // error
@@ -3139,10 +3252,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3242,10 +3357,10 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0)
continue;
- if (id == 0x36) { // Atom ID
+ if (id == mkvmuxer::kMkvChapterAtom) {
status = ParseAtom(pReader, pos, size);
if (status < 0) // error
@@ -3253,10 +3368,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3373,20 +3490,20 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0) // 0 length payload, skip.
continue;
- if (id == 0x00) { // Display ID
+ if (id == mkvmuxer::kMkvChapterDisplay) {
status = ParseDisplay(pReader, pos, size);
if (status < 0) // error
return status;
- } else if (id == 0x1654) { // StringUID ID
+ } else if (id == mkvmuxer::kMkvChapterStringUID) {
status = UnserializeString(pReader, pos, size, m_string_uid);
if (status < 0) // error
return status;
- } else if (id == 0x33C4) { // UID ID
+ } else if (id == mkvmuxer::kMkvChapterUID) {
long long val;
status = UnserializeInt(pReader, pos, size, val);
@@ -3394,14 +3511,14 @@
return status;
m_uid = static_cast<unsigned long long>(val);
- } else if (id == 0x11) { // TimeStart ID
+ } else if (id == mkvmuxer::kMkvChapterTimeStart) {
const long long val = UnserializeUInt(pReader, pos, size);
if (val < 0) // error
return static_cast<long>(val);
m_start_timecode = val;
- } else if (id == 0x12) { // TimeEnd ID
+ } else if (id == mkvmuxer::kMkvChapterTimeEnd) {
const long long val = UnserializeUInt(pReader, pos, size);
if (val < 0) // error
@@ -3411,10 +3528,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3524,20 +3643,20 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0) // No payload.
continue;
- if (id == 0x05) { // ChapterString ID
+ if (id == mkvmuxer::kMkvChapString) {
status = UnserializeString(pReader, pos, size, m_string);
if (status)
return status;
- } else if (id == 0x037C) { // ChapterLanguage ID
+ } else if (id == mkvmuxer::kMkvChapLanguage) {
status = UnserializeString(pReader, pos, size, m_language);
if (status)
return status;
- } else if (id == 0x037E) { // ChapterCountry ID
+ } else if (id == mkvmuxer::kMkvChapCountry) {
status = UnserializeString(pReader, pos, size, m_country);
if (status)
@@ -3545,10 +3664,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3588,7 +3709,7 @@
if (size == 0) // 0 length tag, read another
continue;
- if (id == 0x3373) { // Tag ID
+ if (id == mkvmuxer::kMkvTag) {
status = ParseTag(pos, size);
if (status < 0)
@@ -3596,14 +3717,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3706,7 +3825,7 @@
if (size == 0) // 0 length tag, read another
continue;
- if (id == 0x27C8) { // SimpleTag ID
+ if (id == mkvmuxer::kMkvSimpleTag) {
status = ParseSimpleTag(pReader, pos, size);
if (status < 0)
@@ -3714,14 +3833,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3799,12 +3916,12 @@
if (size == 0) // weird
continue;
- if (id == 0x5A3) { // TagName ID
+ if (id == mkvmuxer::kMkvTagName) {
status = UnserializeString(pReader, pos, size, m_tag_name);
if (status)
return status;
- } else if (id == 0x487) { // TagString ID
+ } else if (id == mkvmuxer::kMkvTagString) {
status = UnserializeString(pReader, pos, size, m_tag_string);
if (status)
@@ -3812,14 +3929,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3866,12 +3981,12 @@
if (status < 0) // error
return status;
- if (id == 0x0AD7B1) { // Timecode Scale
+ if (id == mkvmuxer::kMkvTimecodeScale) {
m_timecodeScale = UnserializeUInt(pReader, pos, size);
if (m_timecodeScale <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0489) { // Segment duration
+ } else if (id == mkvmuxer::kMkvDuration) {
const long status = UnserializeFloat(pReader, pos, size, m_duration);
if (status < 0)
@@ -3879,19 +3994,19 @@
if (m_duration < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0D80) { // MuxingApp
+ } else if (id == mkvmuxer::kMkvMuxingApp) {
const long status =
UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
if (status)
return status;
- } else if (id == 0x1741) { // WritingApp
+ } else if (id == mkvmuxer::kMkvWritingApp) {
const long status =
UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
if (status)
return status;
- } else if (id == 0x3BA9) { // Title
+ } else if (id == mkvmuxer::kMkvTitle) {
const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
if (status)
@@ -3899,10 +4014,17 @@
}
pos += size;
- assert(pos <= stop);
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ const double rollover_check = m_duration * m_timecodeScale;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4039,15 +4161,15 @@
if (status < 0) // error
return status;
- if (id == 0x7E8) {
- // AESSettingsCipherMode
+ if (id == mkvmuxer::kMkvAESSettingsCipherMode) {
aes->cipher_mode = UnserializeUInt(pReader, pos, size);
if (aes->cipher_mode != 1)
return E_FILE_FORMAT_INVALID;
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
return 0;
@@ -4070,14 +4192,15 @@
if (status < 0) // error
return status;
- if (id == 0x1034) // ContentCompression ID
+ if (id == mkvmuxer::kMkvContentCompression)
++compression_count;
- if (id == 0x1035) // ContentEncryption ID
+ if (id == mkvmuxer::kMkvContentEncryption)
++encryption_count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (compression_count <= 0 && encryption_count <= 0)
@@ -4108,19 +4231,15 @@
if (status < 0) // error
return status;
- if (id == 0x1031) {
- // ContentEncodingOrder
+ if (id == mkvmuxer::kMkvContentEncodingOrder) {
encoding_order_ = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x1032) {
- // ContentEncodingScope
+ } else if (id == mkvmuxer::kMkvContentEncodingScope) {
encoding_scope_ = UnserializeUInt(pReader, pos, size);
if (encoding_scope_ < 1)
return -1;
- } else if (id == 0x1033) {
- // ContentEncodingType
+ } else if (id == mkvmuxer::kMkvContentEncodingType) {
encoding_type_ = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x1034) {
- // ContentCompression ID
+ } else if (id == mkvmuxer::kMkvContentCompression) {
ContentCompression* const compression =
new (std::nothrow) ContentCompression();
if (!compression)
@@ -4132,8 +4251,7 @@
return status;
}
*compression_entries_end_++ = compression;
- } else if (id == 0x1035) {
- // ContentEncryption ID
+ } else if (id == mkvmuxer::kMkvContentEncryption) {
ContentEncryption* const encryption =
new (std::nothrow) ContentEncryption();
if (!encryption)
@@ -4148,10 +4266,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4172,21 +4292,18 @@
if (status < 0) // error
return status;
- if (id == 0x254) {
- // ContentCompAlgo
+ if (id == mkvmuxer::kMkvContentCompAlgo) {
long long algo = UnserializeUInt(pReader, pos, size);
if (algo < 0)
return E_FILE_FORMAT_INVALID;
compression->algo = algo;
valid = true;
- } else if (id == 0x255) {
- // ContentCompSettings
+ } else if (id == mkvmuxer::kMkvContentCompSettings) {
if (size <= 0)
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4202,7 +4319,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
// ContentCompAlgo is mandatory
@@ -4227,13 +4345,11 @@
if (status < 0) // error
return status;
- if (id == 0x7E1) {
- // ContentEncAlgo
+ if (id == mkvmuxer::kMkvContentEncAlgo) {
encryption->algo = UnserializeUInt(pReader, pos, size);
if (encryption->algo != 5)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x7E2) {
- // ContentEncKeyID
+ } else if (id == mkvmuxer::kMkvContentEncKeyID) {
delete[] encryption->key_id;
encryption->key_id = NULL;
encryption->key_id_len = 0;
@@ -4242,8 +4358,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4256,8 +4371,7 @@
encryption->key_id = buf;
encryption->key_id_len = buflen;
- } else if (id == 0x7E3) {
- // ContentSignature
+ } else if (id == mkvmuxer::kMkvContentSignature) {
delete[] encryption->signature;
encryption->signature = NULL;
encryption->signature_len = 0;
@@ -4266,8 +4380,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4280,8 +4393,7 @@
encryption->signature = buf;
encryption->signature_len = buflen;
- } else if (id == 0x7E4) {
- // ContentSigKeyID
+ } else if (id == mkvmuxer::kMkvContentSigKeyID) {
delete[] encryption->sig_key_id;
encryption->sig_key_id = NULL;
encryption->sig_key_id_len = 0;
@@ -4290,8 +4402,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4304,14 +4415,11 @@
encryption->sig_key_id = buf;
encryption->sig_key_id_len = buflen;
- } else if (id == 0x7E5) {
- // ContentSigAlgo
+ } else if (id == mkvmuxer::kMkvContentSigAlgo) {
encryption->sig_algo = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x7E6) {
- // ContentSigHashAlgo
+ } else if (id == mkvmuxer::kMkvContentSigHashAlgo) {
encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x7E7) {
- // ContentEncAESSettings
+ } else if (id == mkvmuxer::kMkvContentEncAESSettings) {
const long status = ParseContentEncAESSettingsEntry(
pos, size, pReader, &encryption->aes_settings);
if (status)
@@ -4319,7 +4427,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
return 0;
@@ -4418,7 +4527,7 @@
const size_t len = strlen(src);
- dst = new (std::nothrow) char[len + 1];
+ dst = SafeArrayAlloc<char>(1, len + 1);
if (dst == NULL)
return -1;
@@ -4469,7 +4578,7 @@
if (dst.codecPrivateSize != 0)
return -1;
- dst.codecPrivate = new (std::nothrow) unsigned char[codecPrivateSize];
+ dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
if (dst.codecPrivate == NULL)
return -1;
@@ -4797,11 +4906,12 @@
return status;
// pos now designates start of element
- if (id == 0x2240) // ContentEncoding ID
+ if (id == mkvmuxer::kMkvContentEncoding)
++count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (count <= 0)
@@ -4821,7 +4931,7 @@
return status;
// pos now designates start of element
- if (id == 0x2240) { // ContentEncoding ID
+ if (id == mkvmuxer::kMkvContentEncoding) {
ContentEncoding* const content_encoding =
new (std::nothrow) ContentEncoding();
if (!content_encoding)
@@ -4837,10 +4947,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4892,37 +5004,37 @@
if (status < 0) // error
return status;
- if (id == 0x30) { // pixel width
+ if (id == mkvmuxer::kMkvPixelWidth) {
width = UnserializeUInt(pReader, pos, size);
if (width <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x3A) { // pixel height
+ } else if (id == mkvmuxer::kMkvPixelHeight) {
height = UnserializeUInt(pReader, pos, size);
if (height <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14B0) { // display width
+ } else if (id == mkvmuxer::kMkvDisplayWidth) {
display_width = UnserializeUInt(pReader, pos, size);
if (display_width <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14BA) { // display height
+ } else if (id == mkvmuxer::kMkvDisplayHeight) {
display_height = UnserializeUInt(pReader, pos, size);
if (display_height <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14B2) { // display unit
+ } else if (id == mkvmuxer::kMkvDisplayUnit) {
display_unit = UnserializeUInt(pReader, pos, size);
if (display_unit < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x13B8) { // stereo mode
+ } else if (id == mkvmuxer::kMkvStereoMode) {
stereo_mode = UnserializeUInt(pReader, pos, size);
if (stereo_mode < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0383E3) { // frame rate
+ } else if (id == mkvmuxer::kMkvFrameRate) {
const long status = UnserializeFloat(pReader, pos, size, rate);
if (status < 0)
@@ -4933,10 +5045,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
VideoTrack* const pTrack =
new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
@@ -5110,7 +5224,7 @@
if (status < 0) // error
return status;
- if (id == 0x35) { // Sample Rate
+ if (id == mkvmuxer::kMkvSamplingFrequency) {
status = UnserializeFloat(pReader, pos, size, rate);
if (status < 0)
@@ -5118,12 +5232,12 @@
if (rate <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x1F) { // Channel Count
+ } else if (id == mkvmuxer::kMkvChannels) {
channels = UnserializeUInt(pReader, pos, size);
if (channels <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x2264) { // Bit Depth
+ } else if (id == mkvmuxer::kMkvBitDepth) {
bit_depth = UnserializeUInt(pReader, pos, size);
if (bit_depth <= 0)
@@ -5131,10 +5245,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
AudioTrack* const pTrack =
new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
@@ -5194,14 +5310,16 @@
if (size == 0) // weird
continue;
- if (id == 0x2E) // TrackEntry ID
+ if (id == mkvmuxer::kMkvTrackEntry)
++count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
if (count <= 0)
return 0; // success
@@ -5234,13 +5352,12 @@
const long long element_size = payload_stop - element_start;
- if (id == 0x2E) { // TrackEntry ID
+ if (id == mkvmuxer::kMkvTrackEntry) {
Track*& pTrack = *m_trackEntriesEnd;
pTrack = NULL;
const long status = ParseTrackEntry(pos, payload_size, element_start,
element_size, pTrack);
-
if (status)
return status;
@@ -5249,10 +5366,12 @@
}
pos = payload_stop;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0; // success
}
@@ -5309,16 +5428,16 @@
const long long start = pos;
- if (id == 0x60) { // VideoSettings ID
+ if (id == mkvmuxer::kMkvVideo) {
v.start = start;
v.size = size;
- } else if (id == 0x61) { // AudioSettings ID
+ } else if (id == mkvmuxer::kMkvAudio) {
a.start = start;
a.size = size;
- } else if (id == 0x2D80) { // ContentEncodings ID
+ } else if (id == mkvmuxer::kMkvContentEncodings) {
e.start = start;
e.size = size;
- } else if (id == 0x33C5) { // Track UID
+ } else if (id == mkvmuxer::kMkvTrackUID) {
if (size > 8)
return E_FILE_FORMAT_INVALID;
@@ -5340,49 +5459,49 @@
++pos_;
}
- } else if (id == 0x57) { // Track Number
+ } else if (id == mkvmuxer::kMkvTrackNumber) {
const long long num = UnserializeUInt(pReader, pos, size);
if ((num <= 0) || (num > 127))
return E_FILE_FORMAT_INVALID;
info.number = static_cast<long>(num);
- } else if (id == 0x03) { // Track Type
+ } else if (id == mkvmuxer::kMkvTrackType) {
const long long type = UnserializeUInt(pReader, pos, size);
if ((type <= 0) || (type > 254))
return E_FILE_FORMAT_INVALID;
info.type = static_cast<long>(type);
- } else if (id == 0x136E) { // Track Name
+ } else if (id == mkvmuxer::kMkvName) {
const long status =
UnserializeString(pReader, pos, size, info.nameAsUTF8);
if (status)
return status;
- } else if (id == 0x02B59C) { // Track Language
+ } else if (id == mkvmuxer::kMkvLanguage) {
const long status = UnserializeString(pReader, pos, size, info.language);
if (status)
return status;
- } else if (id == 0x03E383) { // Default Duration
+ } else if (id == mkvmuxer::kMkvDefaultDuration) {
const long long duration = UnserializeUInt(pReader, pos, size);
if (duration < 0)
return E_FILE_FORMAT_INVALID;
info.defaultDuration = static_cast<unsigned long long>(duration);
- } else if (id == 0x06) { // CodecID
+ } else if (id == mkvmuxer::kMkvCodecID) {
const long status = UnserializeString(pReader, pos, size, info.codecId);
if (status)
return status;
- } else if (id == 0x1C) { // lacing
+ } else if (id == mkvmuxer::kMkvFlagLacing) {
lacing = UnserializeUInt(pReader, pos, size);
if ((lacing < 0) || (lacing > 1))
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x23A2) { // Codec Private
+ } else if (id == mkvmuxer::kMkvCodecPrivate) {
delete[] info.codecPrivate;
info.codecPrivate = NULL;
info.codecPrivateSize = 0;
@@ -5390,9 +5509,7 @@
const size_t buflen = static_cast<size_t>(size);
if (buflen) {
- typedef unsigned char* buf_t;
-
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -5407,23 +5524,25 @@
info.codecPrivate = buf;
info.codecPrivateSize = buflen;
}
- } else if (id == 0x058688) { // Codec Name
+ } else if (id == mkvmuxer::kMkvCodecName) {
const long status =
UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
if (status)
return status;
- } else if (id == 0x16AA) { // Codec Delay
+ } else if (id == mkvmuxer::kMkvCodecDelay) {
info.codecDelay = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x16BB) { // Seek Pre Roll
+ } else if (id == mkvmuxer::kMkvSeekPreRoll) {
info.seekPreRoll = UnserializeUInt(pReader, pos, size);
}
pos += size; // consume payload
- assert(pos <= track_stop);
+ if (pos > track_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == track_stop);
+ if (pos != track_stop)
+ return E_FILE_FORMAT_INVALID;
if (info.number <= 0) // not specified
return E_FILE_FORMAT_INVALID;
@@ -5552,98 +5671,88 @@
}
long Cluster::Load(long long& pos, long& len) const {
- assert(m_pSegment);
- assert(m_pos >= m_element_start);
+ if (m_pSegment == NULL)
+ return E_PARSE_FAILED;
if (m_timecode >= 0) // at least partially loaded
return 0;
- assert(m_pos == m_element_start);
- assert(m_element_size < 0);
+ if (m_pos != m_element_start || m_element_size >= 0)
+ return E_PARSE_FAILED;
IMkvReader* const pReader = m_pSegment->m_pReader;
-
long long total, avail;
-
const int status = pReader->Length(&total, &avail);
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
- assert((total < 0) || (m_pos <= total)); // TODO: verify this
+ if (total >= 0 && (avail > total || m_pos > total))
+ return E_FILE_FORMAT_INVALID;
pos = m_pos;
long long cluster_size = -1;
- {
- if ((pos + 1) > avail) {
- len = 1;
- return E_BUFFER_NOT_FULL;
- }
-
- long long result = GetUIntLength(pReader, pos, len);
-
- if (result < 0) // error or underflow
- return static_cast<long>(result);
-
- if (result > 0) // underflow (weird)
- return E_BUFFER_NOT_FULL;
-
- // if ((pos + len) > segment_stop)
- // return E_FILE_FORMAT_INVALID;
-
- if ((pos + len) > avail)
- return E_BUFFER_NOT_FULL;
-
- const long long id_ = ReadUInt(pReader, pos, len);
-
- if (id_ < 0) // error
- return static_cast<long>(id_);
-
- if (id_ != 0x0F43B675) // Cluster ID
- return E_FILE_FORMAT_INVALID;
-
- pos += len; // consume id
-
- // read cluster size
-
- if ((pos + 1) > avail) {
- len = 1;
- return E_BUFFER_NOT_FULL;
- }
-
- result = GetUIntLength(pReader, pos, len);
-
- if (result < 0) // error
- return static_cast<long>(result);
-
- if (result > 0) // weird
- return E_BUFFER_NOT_FULL;
-
- // if ((pos + len) > segment_stop)
- // return E_FILE_FORMAT_INVALID;
-
- if ((pos + len) > avail)
- return E_BUFFER_NOT_FULL;
-
- const long long size = ReadUInt(pReader, pos, len);
-
- if (size < 0) // error
- return static_cast<long>(cluster_size);
-
- if (size == 0)
- return E_FILE_FORMAT_INVALID; // TODO: verify this
-
- pos += len; // consume length of size of element
-
- const long long unknown_size = (1LL << (7 * len)) - 1;
-
- if (size != unknown_size)
- cluster_size = size;
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
}
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error or underflow
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id_ = ReadID(pReader, pos, len);
+
+ if (id_ < 0) // error
+ return static_cast<long>(id_);
+
+ if (id_ != mkvmuxer::kMkvCluster)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume id
+
+ // read cluster size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(cluster_size);
+
+ if (size == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of size of element
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size != unknown_size)
+ cluster_size = size;
+
// pos points to start of payload
long long timecode = -1;
long long new_pos = -1;
@@ -5667,7 +5776,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5676,7 +5785,7 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
@@ -5688,10 +5797,10 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
break;
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCues)
break;
pos += len; // consume ID field
@@ -5708,7 +5817,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5734,13 +5843,13 @@
// pos now points to start of payload
- if (size == 0) // weird
+ if (size == 0)
continue;
if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
return E_FILE_FORMAT_INVALID;
- if (id == 0x67) { // TimeCode ID
+ if (id == mkvmuxer::kMkvTimecode) {
len = static_cast<long>(size);
if ((pos + size) > avail)
@@ -5755,19 +5864,21 @@
if (bBlock)
break;
- } else if (id == 0x20) { // BlockGroup ID
+ } else if (id == mkvmuxer::kMkvBlockGroup) {
bBlock = true;
break;
- } else if (id == 0x23) { // SimpleBlock ID
+ } else if (id == mkvmuxer::kMkvSimpleBlock) {
bBlock = true;
break;
}
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if (timecode < 0) // no timecode found
return E_FILE_FORMAT_INVALID;
@@ -5790,10 +5901,8 @@
if (status < 0)
return status;
- assert(m_pos >= m_element_start);
- assert(m_timecode >= 0);
- // assert(m_size > 0);
- // assert(m_element_size > m_size);
+ if (m_pos < m_element_start || m_timecode < 0)
+ return E_PARSE_FAILED;
const long long cluster_stop =
(m_element_size < 0) ? -1 : m_element_start + m_element_size;
@@ -5810,7 +5919,8 @@
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
pos = m_pos;
@@ -5837,7 +5947,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5846,19 +5956,16 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
- if (id < 0) // error
- return static_cast<long>(id);
-
- if (id == 0) // weird
+ if (id < 0)
return E_FILE_FORMAT_INVALID;
// This is the distinguished set of ID's we use to determine
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) { // Cluster or Cues ID
+ if ((id == mkvmuxer::kMkvCluster) || (id == mkvmuxer::kMkvCues)) {
if (m_element_size < 0)
m_element_size = pos - m_element_start;
@@ -5879,7 +5986,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5905,7 +6012,7 @@
// pos now points to start of payload
- if (size == 0) // weird
+ if (size == 0)
continue;
// const long long block_start = pos;
@@ -5913,8 +6020,10 @@
if (cluster_stop >= 0) {
if (block_stop > cluster_stop) {
- if ((id == 0x20) || (id == 0x23))
+ if (id == mkvmuxer::kMkvBlockGroup ||
+ id == mkvmuxer::kMkvSimpleBlock) {
return E_FILE_FORMAT_INVALID;
+ }
pos = cluster_stop;
break;
@@ -5930,42 +6039,48 @@
Cluster* const this_ = const_cast<Cluster*>(this);
- if (id == 0x20) // BlockGroup
+ if (id == mkvmuxer::kMkvBlockGroup)
return this_->ParseBlockGroup(size, pos, len);
- if (id == 0x23) // SimpleBlock
+ if (id == mkvmuxer::kMkvSimpleBlock)
return this_->ParseSimpleBlock(size, pos, len);
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(m_element_size > 0);
+ if (m_element_size < 1)
+ return E_FILE_FORMAT_INVALID;
m_pos = pos;
- assert((cluster_stop < 0) || (m_pos <= cluster_stop));
+ if (cluster_stop >= 0 && m_pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if (m_entries_count > 0) {
const long idx = m_entries_count - 1;
const BlockEntry* const pLast = m_entries[idx];
- assert(pLast);
+ if (pLast == NULL)
+ return E_PARSE_FAILED;
const Block* const pBlock = pLast->GetBlock();
- assert(pBlock);
+ if (pBlock == NULL)
+ return E_PARSE_FAILED;
const long long start = pBlock->m_start;
if ((total >= 0) && (start > total))
- return -1; // defend against trucated stream
+ return E_PARSE_FAILED; // defend against trucated stream
const long long size = pBlock->m_size;
const long long stop = start + size;
- assert((cluster_stop < 0) || (stop <= cluster_stop));
+ if (cluster_stop >= 0 && stop > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if ((total >= 0) && (stop > total))
- return -1; // defend against trucated stream
+ return E_PARSE_FAILED; // defend against trucated stream
}
return 1; // no more entries
@@ -6058,7 +6173,7 @@
return E_BUFFER_NOT_FULL;
}
- status = CreateBlock(0x23, // simple block id
+ status = CreateBlock(mkvmuxer::kMkvSimpleBlock,
block_start, block_size,
0); // DiscardPadding
@@ -6118,12 +6233,12 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
- if (id == 0) // not a value ID
+ if (id == 0) // not a valid ID
return E_FILE_FORMAT_INVALID;
pos += len; // consume ID field
@@ -6169,14 +6284,14 @@
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
- if (id == 0x35A2) { // DiscardPadding
+ if (id == mkvmuxer::kMkvDiscardPadding) {
status = UnserializeInt(pReader, pos, size, discard_padding);
if (status < 0) // error
return status;
}
- if (id != 0x21) { // sub-part of BlockGroup is not a Block
+ if (id != mkvmuxer::kMkvBlock) {
pos += size; // consume sub-part of block group
if (pos > payload_stop)
@@ -6262,12 +6377,14 @@
}
pos = block_stop; // consume block-part of block group
- assert(pos <= payload_stop);
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == payload_stop);
+ if (pos != payload_stop)
+ return E_FILE_FORMAT_INVALID;
- status = CreateBlock(0x20, // BlockGroup ID
+ status = CreateBlock(mkvmuxer::kMkvBlockGroup,
payload_start, payload_size, discard_padding);
if (status != 0)
return status;
@@ -6310,17 +6427,14 @@
return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed
}
-Cluster* Cluster::Create(Segment* pSegment, long idx, long long off)
-// long long element_size)
-{
- assert(pSegment);
- assert(off >= 0);
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+ if (!pSegment || off < 0)
+ return NULL;
const long long element_start = pSegment->m_start + off;
- Cluster* const pCluster = new Cluster(pSegment, idx, element_start);
- // element_size);
- assert(pCluster);
+ Cluster* const pCluster =
+ new (std::nothrow) Cluster(pSegment, idx, element_start);
return pCluster;
}
@@ -6431,13 +6545,13 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
- if (id != 0x0F43B675) // weird: not cluster ID
- return -1; // generic error
+ if (id != mkvmuxer::kMkvCluster)
+ return E_PARSE_FAILED;
pos += len; // consume Cluster ID field
@@ -6515,7 +6629,7 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
@@ -6524,10 +6638,10 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
return 0; // no entries found
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCues)
return 0; // no entries found
pos += len; // consume id field
@@ -6579,14 +6693,15 @@
if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
return E_FILE_FORMAT_INVALID;
- if (id == 0x20) // BlockGroup ID
+ if (id == mkvmuxer::kMkvBlockGroup)
return 1; // have at least one entry
- if (id == 0x23) // SimpleBlock ID
+ if (id == mkvmuxer::kMkvSimpleBlock)
return 1; // have at least one entry
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
}
@@ -6656,14 +6771,17 @@
long Cluster::CreateBlock(long long id,
long long pos, // absolute pos of payload
long long size, long long discard_padding) {
- assert((id == 0x20) || (id == 0x23)); // BlockGroup or SimpleBlock
+ if (id != mkvmuxer::kMkvBlockGroup && id != mkvmuxer::kMkvSimpleBlock)
+ return E_PARSE_FAILED;
if (m_entries_count < 0) { // haven't parsed anything yet
assert(m_entries == NULL);
assert(m_entries_size == 0);
m_entries_size = 1024;
- m_entries = new BlockEntry*[m_entries_size];
+ m_entries = new (std::nothrow) BlockEntry*[m_entries_size];
+ if (m_entries == NULL)
+ return -1;
m_entries_count = 0;
} else {
@@ -6674,8 +6792,9 @@
if (m_entries_count >= m_entries_size) {
const long entries_size = 2 * m_entries_size;
- BlockEntry** const entries = new BlockEntry*[entries_size];
- assert(entries);
+ BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size];
+ if (entries == NULL)
+ return -1;
BlockEntry** src = m_entries;
BlockEntry** const src_end = src + m_entries_count;
@@ -6692,9 +6811,9 @@
}
}
- if (id == 0x20) // BlockGroup ID
+ if (id == mkvmuxer::kMkvBlockGroup)
return CreateBlockGroup(pos, size, discard_padding);
- else // SimpleBlock ID
+ else
return CreateSimpleBlock(pos, size);
}
@@ -6725,9 +6844,9 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0); // TODO
- assert((pos + len) <= stop);
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -6737,12 +6856,12 @@
pos += len; // consume size
- if (id == 0x21) { // Block ID
+ if (id == mkvmuxer::kMkvBlock) {
if (bpos < 0) { // Block ID
bpos = pos;
bsize = size;
}
- } else if (id == 0x1B) { // Duration ID
+ } else if (id == mkvmuxer::kMkvBlockDuration) {
if (size > 8)
return E_FILE_FORMAT_INVALID;
@@ -6750,7 +6869,7 @@
if (duration < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x7B) { // ReferenceBlock
+ } else if (id == mkvmuxer::kMkvReferenceBlock) {
if (size > 8 || size <= 0)
return E_FILE_FORMAT_INVALID;
const long size_ = static_cast<long>(size);
@@ -6764,17 +6883,19 @@
if (time <= 0) // see note above
prev = time;
- else // weird
+ else
next = time;
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (bpos < 0)
return E_FILE_FORMAT_INVALID;
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
assert(bsize >= 0);
const long idx = m_entries_count;
@@ -7213,7 +7334,9 @@
return E_FILE_FORMAT_INVALID;
m_frame_count = 1;
- m_frames = new Frame[m_frame_count];
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
Frame& f = m_frames[0];
f.pos = pos;
@@ -7239,18 +7362,23 @@
return E_FILE_FORMAT_INVALID;
++pos; // consume frame count
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
m_frame_count = int(biased_count) + 1;
- m_frames = new Frame[m_frame_count];
- assert(m_frames);
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
+
+ if (!m_frames)
+ return E_FILE_FORMAT_INVALID;
if (lacing == 1) { // Xiph
Frame* pf = m_frames;
Frame* const pf_end = pf + m_frame_count;
- long size = 0;
+ long long size = 0;
int frame_count = m_frame_count;
while (frame_count > 1) {
@@ -7277,6 +7405,8 @@
Frame& f = *pf++;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
f.pos = 0; // patch later
@@ -7289,8 +7419,8 @@
--frame_count;
}
- assert(pf < pf_end);
- assert(pos <= stop);
+ if (pf >= pf_end || pos > stop)
+ return E_FILE_FORMAT_INVALID;
{
Frame& f = *pf++;
@@ -7318,11 +7448,17 @@
Frame& f = *pf++;
assert((pos + f.len) <= stop);
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
f.pos = pos;
pos += f.len;
}
assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
} else if (lacing == 2) { // fixed-size lacing
if (pos >= stop)
return E_FILE_FORMAT_INVALID;
@@ -7342,6 +7478,8 @@
while (pf != pf_end) {
assert((pos + frame_size) <= stop);
+ if ((pos + frame_size) > stop)
+ return E_FILE_FORMAT_INVALID;
Frame& f = *pf++;
@@ -7352,13 +7490,16 @@
}
assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
} else {
assert(lacing == 3); // EBML lacing
if (pos >= stop)
return E_FILE_FORMAT_INVALID;
- long size = 0;
+ long long size = 0;
int frame_count = m_frame_count;
long long frame_size = ReadUInt(pReader, pos, len);
@@ -7396,6 +7537,9 @@
return E_FILE_FORMAT_INVALID;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
const Frame& prev = *pf++;
assert(prev.len == frame_size);
@@ -7403,6 +7547,8 @@
return E_FILE_FORMAT_INVALID;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
Frame& curr = *pf;
@@ -7417,7 +7563,8 @@
return E_FILE_FORMAT_INVALID;
pos += len; // consume length of (delta) size
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
const int exp = 7 * len - 1;
const long long bias = (1LL << exp) - 1LL;
@@ -7439,18 +7586,20 @@
// parse last frame
if (frame_count > 0) {
- assert(pos <= stop);
- assert(pf < pf_end);
+ if (pos > stop || pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
const Frame& prev = *pf++;
assert(prev.len == frame_size);
if (prev.len != frame_size)
return E_FILE_FORMAT_INVALID;
- assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
Frame& curr = *pf++;
- assert(pf == pf_end);
+ if (pf != pf_end)
+ return E_FILE_FORMAT_INVALID;
curr.pos = 0; // patch later
@@ -7471,6 +7620,8 @@
while (pf != pf_end) {
Frame& f = *pf++;
assert((pos + f.len) <= stop);
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
f.pos = pos;
pos += f.len;
diff --git a/third_party/libwebm/mkvparser.hpp b/third_party/libwebm/mkvparser.hpp
index aa0b432..75ef69d 100644
--- a/third_party/libwebm/mkvparser.hpp
+++ b/third_party/libwebm/mkvparser.hpp
@@ -9,12 +9,13 @@
#ifndef MKVPARSER_HPP
#define MKVPARSER_HPP
-#include <cstdlib>
-#include <cstdio>
#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
namespace mkvparser {
+const int E_PARSE_FAILED = -1;
const int E_FILE_FORMAT_INVALID = -2;
const int E_BUFFER_NOT_FULL = -3;
@@ -27,8 +28,11 @@
virtual ~IMkvReader();
};
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size);
long long GetUIntLength(IMkvReader*, long long, long&);
long long ReadUInt(IMkvReader*, long long, long&);
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
long long UnserializeUInt(IMkvReader*, long long pos, long long size);
long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
@@ -833,7 +837,7 @@
private:
bool Init() const;
- void PreloadCuePoint(long&, long long) const;
+ bool PreloadCuePoint(long&, long long) const;
mutable CuePoint** m_cue_points;
mutable long m_count;
@@ -999,8 +1003,8 @@
long DoLoadClusterUnknownSize(long long&, long&);
long DoParseNext(const Cluster*&, long long&, long&);
- void AppendCluster(Cluster*);
- void PreloadCluster(Cluster*, ptrdiff_t);
+ bool AppendCluster(Cluster*);
+ bool PreloadCluster(Cluster*, ptrdiff_t);
// void ParseSeekHead(long long pos, long long size);
// void ParseSeekEntry(long long pos, long long size);
diff --git a/third_party/libwebm/webmids.hpp b/third_party/libwebm/webmids.hpp
index 6874e44..ad4ab57 100644
--- a/third_party/libwebm/webmids.hpp
+++ b/third_party/libwebm/webmids.hpp
@@ -41,6 +41,7 @@
kMkvTimecodeScale = 0x2AD7B1,
kMkvDuration = 0x4489,
kMkvDateUTC = 0x4461,
+ kMkvTitle = 0x7BA9,
kMkvMuxingApp = 0x4D80,
kMkvWritingApp = 0x5741,
// Cluster
@@ -107,9 +108,16 @@
kMkvContentEncodingOrder = 0x5031,
kMkvContentEncodingScope = 0x5032,
kMkvContentEncodingType = 0x5033,
+ kMkvContentCompression = 0x5034,
+ kMkvContentCompAlgo = 0x4254,
+ kMkvContentCompSettings = 0x4255,
kMkvContentEncryption = 0x5035,
kMkvContentEncAlgo = 0x47E1,
kMkvContentEncKeyID = 0x47E2,
+ kMkvContentSignature = 0x47E3,
+ kMkvContentSigKeyID = 0x47E4,
+ kMkvContentSigAlgo = 0x47E5,
+ kMkvContentSigHashAlgo = 0x47E6,
kMkvContentEncAESSettings = 0x47E7,
kMkvAESSettingsCipherMode = 0x47E8,
kMkvAESSettingsCipherInitData = 0x47E9,
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index cdb6ab9..0622a7d 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -463,7 +463,5 @@
memset(cm->prev_mip, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
- vp10_zero(cm->ref_frame_sign_bias);
-
cm->frame_context_idx = 0;
}
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index ad849af..7217f6d 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -695,6 +695,13 @@
1023,
};
+const scan_order vp10_default_scan_orders[TX_SIZES] = {
+ {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+ {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+ {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+ {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+};
+
const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
diff --git a/vp10/common/scan.h b/vp10/common/scan.h
index ada67f4..f5a020f 100644
--- a/vp10/common/scan.h
+++ b/vp10/common/scan.h
@@ -29,6 +29,7 @@
const int16_t *neighbors;
} scan_order;
+extern const scan_order vp10_default_scan_orders[TX_SIZES];
extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,
diff --git a/vp10/common/vp10_fwd_txfm.c b/vp10/common/vp10_fwd_txfm.c
new file mode 100644
index 0000000..3211cd0
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm.c
@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_fwd_txfm.h"
+
+void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[4 * 4];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t input[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (0 == pass) {
+ input[0] = in_pass0[0 * stride] * 16;
+ input[1] = in_pass0[1 * stride] * 16;
+ input[2] = in_pass0[2 * stride] * 16;
+ input[3] = in_pass0[3 * stride] * 16;
+ if (i == 0 && input[0]) {
+ input[0] += 1;
+ }
+ } else {
+ input[0] = in[0 * 4];
+ input[1] = in[1 * 4];
+ input[2] = in[2 * 4];
+ input[3] = in[3 * 4];
+ }
+ // Transform.
+ step[0] = input[0] + input[3];
+ step[1] = input[1] + input[2];
+ step[2] = input[1] - input[2];
+ step[3] = input[0] - input[3];
+ temp1 = (step[0] + step[1]) * cospi_16_64;
+ temp2 = (step[0] - step[1]) * cospi_16_64;
+ out[0] = (tran_low_t)fdct_round_shift(temp1);
+ out[2] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[3] = (tran_low_t)fdct_round_shift(temp2);
+ // Do next column (which is a transposed row in second/horizontal pass)
+ in_pass0++;
+ in++;
+ out += 4;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+
+ {
+ int i, j;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+ }
+}
+
+void vp10_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 4; ++r)
+ for (c = 0; c < 4; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum << 1;
+ output[1] = 0;
+}
+
+void vp10_fdct8x8_c(const int16_t *input,
+ tran_low_t *final_output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *output = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Transform columns
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ int i;
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+ output += 8;
+ }
+ in = intermediate;
+ output = final_output;
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ final_output[j + i * 8] /= 2;
+ }
+}
+
+void vp10_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 8; ++r)
+ for (c = 0; c < 8; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum;
+ output[1] = 0;
+}
+
+void vp10_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[256];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 16; i++) {
+ if (0 == pass) {
+ // Calculate input for the first 8 results.
+ input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+ input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+ input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+ input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+ input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+ input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+ input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+ input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+ // Calculate input for the next 8 results.
+ step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+ step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+ step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+ step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+ step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+ step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+ step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+ step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+ } else {
+ // Calculate input for the first 8 results.
+ input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+ input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+ input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+ input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+ input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+ input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+ input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+ input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+ // Calculate input for the next 8 results.
+ step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+ step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+ step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+ step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+ step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+ step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+ step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+ step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
+ }
+ // Do next column (which is a transposed row in second/horizontal pass)
+ in++;
+ in_pass0++;
+ out += 16;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+}
+
+void vp10_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 16; ++r)
+ for (c = 0; c < 16; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum >> 1;
+ output[1] = 0;
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+ // and make the bounds consts.
+ // assert(-131072 <= rv && rv <= 131071);
+ return rv;
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+ tran_high_t rv = (input + 1 + (input < 0)) >> 2;
+ return rv;
+}
+
+void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+ tran_high_t step[32];
+ // Stage 1
+ step[0] = input[0] + input[(32 - 1)];
+ step[1] = input[1] + input[(32 - 2)];
+ step[2] = input[2] + input[(32 - 3)];
+ step[3] = input[3] + input[(32 - 4)];
+ step[4] = input[4] + input[(32 - 5)];
+ step[5] = input[5] + input[(32 - 6)];
+ step[6] = input[6] + input[(32 - 7)];
+ step[7] = input[7] + input[(32 - 8)];
+ step[8] = input[8] + input[(32 - 9)];
+ step[9] = input[9] + input[(32 - 10)];
+ step[10] = input[10] + input[(32 - 11)];
+ step[11] = input[11] + input[(32 - 12)];
+ step[12] = input[12] + input[(32 - 13)];
+ step[13] = input[13] + input[(32 - 14)];
+ step[14] = input[14] + input[(32 - 15)];
+ step[15] = input[15] + input[(32 - 16)];
+ step[16] = -input[16] + input[(32 - 17)];
+ step[17] = -input[17] + input[(32 - 18)];
+ step[18] = -input[18] + input[(32 - 19)];
+ step[19] = -input[19] + input[(32 - 20)];
+ step[20] = -input[20] + input[(32 - 21)];
+ step[21] = -input[21] + input[(32 - 22)];
+ step[22] = -input[22] + input[(32 - 23)];
+ step[23] = -input[23] + input[(32 - 24)];
+ step[24] = -input[24] + input[(32 - 25)];
+ step[25] = -input[25] + input[(32 - 26)];
+ step[26] = -input[26] + input[(32 - 27)];
+ step[27] = -input[27] + input[(32 - 28)];
+ step[28] = -input[28] + input[(32 - 29)];
+ step[29] = -input[29] + input[(32 - 30)];
+ step[30] = -input[30] + input[(32 - 31)];
+ step[31] = -input[31] + input[(32 - 32)];
+
+ // Stage 2
+ output[0] = step[0] + step[16 - 1];
+ output[1] = step[1] + step[16 - 2];
+ output[2] = step[2] + step[16 - 3];
+ output[3] = step[3] + step[16 - 4];
+ output[4] = step[4] + step[16 - 5];
+ output[5] = step[5] + step[16 - 6];
+ output[6] = step[6] + step[16 - 7];
+ output[7] = step[7] + step[16 - 8];
+ output[8] = -step[8] + step[16 - 9];
+ output[9] = -step[9] + step[16 - 10];
+ output[10] = -step[10] + step[16 - 11];
+ output[11] = -step[11] + step[16 - 12];
+ output[12] = -step[12] + step[16 - 13];
+ output[13] = -step[13] + step[16 - 14];
+ output[14] = -step[14] + step[16 - 15];
+ output[15] = -step[15] + step[16 - 16];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = step[18];
+ output[19] = step[19];
+
+ output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+ output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+ output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+ output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+ output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+ output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+ output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+ output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+ output[28] = step[28];
+ output[29] = step[29];
+ output[30] = step[30];
+ output[31] = step[31];
+
+ // dump the magnitude by 4, hence the intermediate values are within
+ // the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
+ // Stage 3
+ step[0] = output[0] + output[(8 - 1)];
+ step[1] = output[1] + output[(8 - 2)];
+ step[2] = output[2] + output[(8 - 3)];
+ step[3] = output[3] + output[(8 - 4)];
+ step[4] = -output[4] + output[(8 - 5)];
+ step[5] = -output[5] + output[(8 - 6)];
+ step[6] = -output[6] + output[(8 - 7)];
+ step[7] = -output[7] + output[(8 - 8)];
+ step[8] = output[8];
+ step[9] = output[9];
+ step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+ step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+ step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+ step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+ step[14] = output[14];
+ step[15] = output[15];
+
+ step[16] = output[16] + output[23];
+ step[17] = output[17] + output[22];
+ step[18] = output[18] + output[21];
+ step[19] = output[19] + output[20];
+ step[20] = -output[20] + output[19];
+ step[21] = -output[21] + output[18];
+ step[22] = -output[22] + output[17];
+ step[23] = -output[23] + output[16];
+ step[24] = -output[24] + output[31];
+ step[25] = -output[25] + output[30];
+ step[26] = -output[26] + output[29];
+ step[27] = -output[27] + output[28];
+ step[28] = output[28] + output[27];
+ step[29] = output[29] + output[26];
+ step[30] = output[30] + output[25];
+ step[31] = output[31] + output[24];
+
+ // Stage 4
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = -step[2] + step[1];
+ output[3] = -step[3] + step[0];
+ output[4] = step[4];
+ output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+ output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = -step[10] + step[9];
+ output[11] = -step[11] + step[8];
+ output[12] = -step[12] + step[15];
+ output[13] = -step[13] + step[14];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+ output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+ output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+ output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+ output[22] = step[22];
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = step[25];
+ output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+ output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+ output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+ output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+ output[30] = step[30];
+ output[31] = step[31];
+
+ // Stage 5
+ step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+ step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+ step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+ step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+ step[4] = output[4] + output[5];
+ step[5] = -output[5] + output[4];
+ step[6] = -output[6] + output[7];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+ step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+ step[11] = output[11];
+ step[12] = output[12];
+ step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+ step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+ step[15] = output[15];
+
+ step[16] = output[16] + output[19];
+ step[17] = output[17] + output[18];
+ step[18] = -output[18] + output[17];
+ step[19] = -output[19] + output[16];
+ step[20] = -output[20] + output[23];
+ step[21] = -output[21] + output[22];
+ step[22] = output[22] + output[21];
+ step[23] = output[23] + output[20];
+ step[24] = output[24] + output[27];
+ step[25] = output[25] + output[26];
+ step[26] = -output[26] + output[25];
+ step[27] = -output[27] + output[24];
+ step[28] = -output[28] + output[31];
+ step[29] = -output[29] + output[30];
+ step[30] = output[30] + output[29];
+ step[31] = output[31] + output[28];
+
+ // Stage 6
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+ output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+ output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+ output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+ output[8] = step[8] + step[9];
+ output[9] = -step[9] + step[8];
+ output[10] = -step[10] + step[11];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = -step[13] + step[12];
+ output[14] = -step[14] + step[15];
+ output[15] = step[15] + step[14];
+
+ output[16] = step[16];
+ output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+ output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+ output[19] = step[19];
+ output[20] = step[20];
+ output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+ output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+ output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+ output[27] = step[27];
+ output[28] = step[28];
+ output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+ output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+ output[31] = step[31];
+
+ // Stage 7
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+ step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+ step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+ step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+ step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+ step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+ step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+ step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+ step[16] = output[16] + output[17];
+ step[17] = -output[17] + output[16];
+ step[18] = -output[18] + output[19];
+ step[19] = output[19] + output[18];
+ step[20] = output[20] + output[21];
+ step[21] = -output[21] + output[20];
+ step[22] = -output[22] + output[23];
+ step[23] = output[23] + output[22];
+ step[24] = output[24] + output[25];
+ step[25] = -output[25] + output[24];
+ step[26] = -output[26] + output[27];
+ step[27] = output[27] + output[26];
+ step[28] = output[28] + output[29];
+ step[29] = -output[29] + output[28];
+ step[30] = -output[30] + output[31];
+ step[31] = output[31] + output[30];
+
+ // Final stage --- outputs indices are bit-reversed.
+ output[0] = step[0];
+ output[16] = step[1];
+ output[8] = step[2];
+ output[24] = step[3];
+ output[4] = step[4];
+ output[20] = step[5];
+ output[12] = step[6];
+ output[28] = step[7];
+ output[2] = step[8];
+ output[18] = step[9];
+ output[10] = step[10];
+ output[26] = step[11];
+ output[6] = step[12];
+ output[22] = step[13];
+ output[14] = step[14];
+ output[30] = step[15];
+
+ output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+ output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+ output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+ output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+ output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+ output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+ output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+ output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+ output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+ output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+ output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+ output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+ output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+ output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+ output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+ output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void vp10_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ int i, j;
+ tran_high_t output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ vp10_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = output[j + i * 32];
+ vp10_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+
+// Note that although we use dct_32_round in dct32 computation flow,
+// this 2d fdct32x32 for rate-distortion optimization loop is operating
+// within 16 bits precision.
+void vp10_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+ int i, j;
+ tran_high_t output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ vp10_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ // TODO(cd): see quality impact of only doing
+ // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // PS: also change code in vp10_dsp/x86/vp10_dct_sse2.c
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = output[j + i * 32];
+ vp10_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+
+void vp10_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 32; ++r)
+ for (c = 0; c < 32; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum >> 3;
+ output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp10_fdct4x4_c(input, output, stride);
+}
+
+void vp10_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp10_fdct8x8_c(input, final_output, stride);
+}
+
+void vp10_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp10_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp10_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp10_fdct16x16_c(input, output, stride);
+}
+
+void vp10_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp10_fdct16x16_1_c(input, output, stride);
+}
+
+void vp10_highbd_fdct32x32_c(const int16_t *input,
+ tran_low_t *out, int stride) {
+ vp10_fdct32x32_c(input, out, stride);
+}
+
+void vp10_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ vp10_fdct32x32_rd_c(input, out, stride);
+}
+
+void vp10_highbd_fdct32x32_1_c(const int16_t *input,
+ tran_low_t *out, int stride) {
+ vp10_fdct32x32_1_c(input, out, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_fwd_txfm.h b/vp10/common/vp10_fwd_txfm.h
new file mode 100644
index 0000000..46dbf3d
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_VP10_FWD_TXFM_H_
+#define VP10_COMMON_VP10_FWD_TXFM_H_
+
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif // VP10_COMMON_VP10_FWD_TXFM_H_
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
new file mode 100644
index 0000000..403b209
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm.c
@@ -0,0 +1,2499 @@
+/*
+ *
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vp10/common/vp10_inv_txfm.h"
+
+void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1, 8);
+ op[1] = WRAPLOW(b1, 8);
+ op[2] = WRAPLOW(c1, 8);
+ op[3] = WRAPLOW(d1, 8);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_iwht4x4_1_add_c(const tran_low_t *in,
+ uint8_t *dest,
+ int dest_stride) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1, 8);
+ op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+ dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+ dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+ dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3], 8);
+ output[1] = WRAPLOW(step[1] + step[2], 8);
+ output[2] = WRAPLOW(step[1] - step[2], 8);
+ output[3] = WRAPLOW(step[0] - step[3], 8);
+}
+
+void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ vp10_idct4_c(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_idct4_c(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 4));
+ }
+ }
+}
+
+void vp10_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_add(dest[0], a1);
+ dest[1] = clip_pixel_add(dest[1], a1);
+ dest[2] = clip_pixel_add(dest[2], a1);
+ dest[3] = clip_pixel_add(dest[3], a1);
+ dest += dest_stride;
+ }
+}
+
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ temp1 = (step1[0] + step1[2]) * cospi_16_64;
+ temp2 = (step1[0] - step1[2]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ // stage 3
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7], 8);
+ output[1] = WRAPLOW(step1[1] + step1[6], 8);
+ output[2] = WRAPLOW(step1[2] + step1[5], 8);
+ output[3] = WRAPLOW(step1[3] + step1[4], 8);
+ output[4] = WRAPLOW(step1[3] - step1[4], 8);
+ output[5] = WRAPLOW(step1[2] - step1[5], 8);
+ output[6] = WRAPLOW(step1[1] - step1[6], 8);
+ output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ for (i = 0; i < 8; ++i) {
+ vp10_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
+ output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+}
+
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+ // stage 2
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+ x0 = WRAPLOW(s0 + s2, 8);
+ x1 = WRAPLOW(s1 + s3, 8);
+ x2 = WRAPLOW(s0 - s2, 8);
+ x3 = WRAPLOW(s1 - s3, 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+ // stage 3
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
+
+ x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+
+ output[0] = WRAPLOW(x0, 8);
+ output[1] = WRAPLOW(-x4, 8);
+ output[2] = WRAPLOW(x6, 8);
+ output[3] = WRAPLOW(-x2, 8);
+ output[4] = WRAPLOW(x3, 8);
+ output[5] = WRAPLOW(-x7, 8);
+ output[6] = WRAPLOW(x5, 8);
+ output[7] = WRAPLOW(-x1, 8);
+}
+
+void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ // only first 4 row has non-zero coefs
+ for (i = 0; i < 4; ++i) {
+ vp10_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+ step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+ step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+ step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+ step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+ step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+ step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+ step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+ step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+ step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+ step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+ step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+ step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+ step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+ step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+ step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+ step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15], 8);
+ output[1] = WRAPLOW(step2[1] + step2[14], 8);
+ output[2] = WRAPLOW(step2[2] + step2[13], 8);
+ output[3] = WRAPLOW(step2[3] + step2[12], 8);
+ output[4] = WRAPLOW(step2[4] + step2[11], 8);
+ output[5] = WRAPLOW(step2[5] + step2[10], 8);
+ output[6] = WRAPLOW(step2[6] + step2[9], 8);
+ output[7] = WRAPLOW(step2[7] + step2[8], 8);
+ output[8] = WRAPLOW(step2[7] - step2[8], 8);
+ output[9] = WRAPLOW(step2[6] - step2[9], 8);
+ output[10] = WRAPLOW(step2[5] - step2[10], 8);
+ output[11] = WRAPLOW(step2[4] - step2[11], 8);
+ output[12] = WRAPLOW(step2[3] - step2[12], 8);
+ output[13] = WRAPLOW(step2[2] - step2[13], 8);
+ output[14] = WRAPLOW(step2[1] - step2[14], 8);
+ output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ vp10_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4, 8);
+ x1 = WRAPLOW(s1 + s5, 8);
+ x2 = WRAPLOW(s2 + s6, 8);
+ x3 = WRAPLOW(s3 + s7, 8);
+ x4 = WRAPLOW(s0 - s4, 8);
+ x5 = WRAPLOW(s1 - s5, 8);
+ x6 = WRAPLOW(s2 - s6, 8);
+ x7 = WRAPLOW(s3 - s7, 8);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(check_range(s0 + s2), 8);
+ x1 = WRAPLOW(check_range(s1 + s3), 8);
+ x2 = WRAPLOW(check_range(s0 - s2), 8);
+ x3 = WRAPLOW(check_range(s1 - s3), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+ x8 = WRAPLOW(check_range(s8 + s10), 8);
+ x9 = WRAPLOW(check_range(s9 + s11), 8);
+ x10 = WRAPLOW(check_range(s8 - s10), 8);
+ x11 = WRAPLOW(check_range(s9 - s11), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+
+ output[0] = WRAPLOW(x0, 8);
+ output[1] = WRAPLOW(-x8, 8);
+ output[2] = WRAPLOW(x12, 8);
+ output[3] = WRAPLOW(-x4, 8);
+ output[4] = WRAPLOW(x6, 8);
+ output[5] = WRAPLOW(x14, 8);
+ output[6] = WRAPLOW(x10, 8);
+ output[7] = WRAPLOW(x2, 8);
+ output[8] = WRAPLOW(x3, 8);
+ output[9] = WRAPLOW(x11, 8);
+ output[10] = WRAPLOW(x15, 8);
+ output[11] = WRAPLOW(x7, 8);
+ output[12] = WRAPLOW(x5, 8);
+ output[13] = WRAPLOW(-x13, 8);
+ output[14] = WRAPLOW(x9, 8);
+ output[15] = WRAPLOW(-x1, 8);
+}
+
+void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 4x4 area, we only need to calculate first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ vp10_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ vp10_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct16x16_1_add_c(const tran_low_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+ step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+ step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+ step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+ step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+ step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+ step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+ step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+ step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+ step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+ step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+ step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+ step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+ step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+ step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+ step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+ step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+ step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+ step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+ step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+ step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+ step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+ step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+ step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+ step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+ step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+ step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+ step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+ step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+ step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+ step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+ step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+ step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+ step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+ step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+ step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+ step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+ step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+ step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+ step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+ step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+ step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+ step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+ step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+ step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+ step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+ step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+ step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+ step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+ step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+ step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+ step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+ step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+ step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+ step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+ step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+ step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+ step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+ step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+ step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+ step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+ step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+ step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+ step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+ step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+ step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+ step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+ step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+ step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+ step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+ step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+ step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+ step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31], 8);
+ output[1] = WRAPLOW(step1[1] + step1[30], 8);
+ output[2] = WRAPLOW(step1[2] + step1[29], 8);
+ output[3] = WRAPLOW(step1[3] + step1[28], 8);
+ output[4] = WRAPLOW(step1[4] + step1[27], 8);
+ output[5] = WRAPLOW(step1[5] + step1[26], 8);
+ output[6] = WRAPLOW(step1[6] + step1[25], 8);
+ output[7] = WRAPLOW(step1[7] + step1[24], 8);
+ output[8] = WRAPLOW(step1[8] + step1[23], 8);
+ output[9] = WRAPLOW(step1[9] + step1[22], 8);
+ output[10] = WRAPLOW(step1[10] + step1[21], 8);
+ output[11] = WRAPLOW(step1[11] + step1[20], 8);
+ output[12] = WRAPLOW(step1[12] + step1[19], 8);
+ output[13] = WRAPLOW(step1[13] + step1[18], 8);
+ output[14] = WRAPLOW(step1[14] + step1[17], 8);
+ output[15] = WRAPLOW(step1[15] + step1[16], 8);
+ output[16] = WRAPLOW(step1[15] - step1[16], 8);
+ output[17] = WRAPLOW(step1[14] - step1[17], 8);
+ output[18] = WRAPLOW(step1[13] - step1[18], 8);
+ output[19] = WRAPLOW(step1[12] - step1[19], 8);
+ output[20] = WRAPLOW(step1[11] - step1[20], 8);
+ output[21] = WRAPLOW(step1[10] - step1[21], 8);
+ output[22] = WRAPLOW(step1[9] - step1[22], 8);
+ output[23] = WRAPLOW(step1[8] - step1[23], 8);
+ output[24] = WRAPLOW(step1[7] - step1[24], 8);
+ output[25] = WRAPLOW(step1[6] - step1[25], 8);
+ output[26] = WRAPLOW(step1[5] - step1[26], 8);
+ output[27] = WRAPLOW(step1[4] - step1[27], 8);
+ output[28] = WRAPLOW(step1[3] - step1[28], 8);
+ output[29] = WRAPLOW(step1[2] - step1[29], 8);
+ output[30] = WRAPLOW(step1[1] - step1[30], 8);
+ output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ vp10_idct32_c(input, outptr);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ vp10_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
+ vp10_idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ vp10_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct32x32_1_add_c(const tran_low_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_high_t a1;
+
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1, bd);
+ op[1] = WRAPLOW(b1, bd);
+ op[2] = WRAPLOW(c1, bd);
+ op[3] = WRAPLOW(d1, bd);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void) bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1, bd);
+ op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = highbd_clip_pixel_add(
+ dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] = highbd_clip_pixel_add(
+ dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] = highbd_clip_pixel_add(
+ dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] = highbd_clip_pixel_add(
+ dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void) bd;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3], bd);
+ output[1] = WRAPLOW(step[1] + step[2], bd);
+ output[2] = WRAPLOW(step[1] - step[2], bd);
+ output[3] = WRAPLOW(step[0] - step[3], bd);
+}
+
+void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+ dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+ dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+ dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+ dest += dest_stride;
+ }
+}
+
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2 & stage 3 - even half
+ vp10_highbd_idct4_c(step1, step1, bd);
+
+ // stage 2 - odd half
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ // stage 3 - odd half
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7], bd);
+ output[1] = WRAPLOW(step1[1] + step1[6], bd);
+ output[2] = WRAPLOW(step1[2] + step1[5], bd);
+ output[3] = WRAPLOW(step1[3] + step1[4], bd);
+ output[4] = WRAPLOW(step1[3] - step1[4], bd);
+ output[5] = WRAPLOW(step1[2] - step1[5], bd);
+ output[6] = WRAPLOW(step1[1] - step1[6], bd);
+ output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 8; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3)) {
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = (tran_high_t)(x0 - x2 + x3);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[7];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = WRAPLOW(s0 + s2, bd);
+ x1 = WRAPLOW(s1 + s3, bd);
+ x2 = WRAPLOW(s0 - s2, bd);
+ x3 = WRAPLOW(s1 - s3, bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
+ output[0] = WRAPLOW(x0, bd);
+ output[1] = WRAPLOW(-x4, bd);
+ output[2] = WRAPLOW(x6, bd);
+ output[3] = WRAPLOW(-x2, bd);
+ output[4] = WRAPLOW(x3, bd);
+ output[5] = WRAPLOW(-x7, bd);
+ output[6] = WRAPLOW(x5, bd);
+ output[7] = WRAPLOW(-x1, bd);
+}
+
+void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ // Only first 4 row has non-zero coefs.
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15], bd);
+ output[1] = WRAPLOW(step2[1] + step2[14], bd);
+ output[2] = WRAPLOW(step2[2] + step2[13], bd);
+ output[3] = WRAPLOW(step2[3] + step2[12], bd);
+ output[4] = WRAPLOW(step2[4] + step2[11], bd);
+ output[5] = WRAPLOW(step2[5] + step2[10], bd);
+ output[6] = WRAPLOW(step2[6] + step2[9], bd);
+ output[7] = WRAPLOW(step2[7] + step2[8], bd);
+ output[8] = WRAPLOW(step2[7] - step2[8], bd);
+ output[9] = WRAPLOW(step2[6] - step2[9], bd);
+ output[10] = WRAPLOW(step2[5] - step2[10], bd);
+ output[11] = WRAPLOW(step2[4] - step2[11], bd);
+ output[12] = WRAPLOW(step2[3] - step2[12], bd);
+ output[13] = WRAPLOW(step2[2] - step2[13], bd);
+ output[14] = WRAPLOW(step2[1] - step2[14], bd);
+ output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 16; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6),
+ bd);
+ }
+ }
+}
+
+void vp10_highbd_iadst16_c(const tran_low_t *input,
+ tran_low_t *output,
+ int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_low_t x0 = input[15];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[13];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[11];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[9];
+ tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4, bd);
+ x1 = WRAPLOW(s1 + s5, bd);
+ x2 = WRAPLOW(s2 + s6, bd);
+ x3 = WRAPLOW(s3 + s7, bd);
+ x4 = WRAPLOW(s0 - s4, bd);
+ x5 = WRAPLOW(s1 - s5, bd);
+ x6 = WRAPLOW(s2 - s6, bd);
+ x7 = WRAPLOW(s3 - s7, bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2, bd);
+ x1 = WRAPLOW(s1 + s3, bd);
+ x2 = WRAPLOW(s0 - s2, bd);
+ x3 = WRAPLOW(s1 - s3, bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+ x8 = WRAPLOW(s8 + s10, bd);
+ x9 = WRAPLOW(s9 + s11, bd);
+ x10 = WRAPLOW(s8 - s10, bd);
+ x11 = WRAPLOW(s9 - s11, bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+ output[0] = WRAPLOW(x0, bd);
+ output[1] = WRAPLOW(-x8, bd);
+ output[2] = WRAPLOW(x12, bd);
+ output[3] = WRAPLOW(-x4, bd);
+ output[4] = WRAPLOW(x6, bd);
+ output[5] = WRAPLOW(x14, bd);
+ output[6] = WRAPLOW(x10, bd);
+ output[7] = WRAPLOW(x2, bd);
+ output[8] = WRAPLOW(x3, bd);
+ output[9] = WRAPLOW(x11, bd);
+ output[10] = WRAPLOW(x15, bd);
+ output[11] = WRAPLOW(x7, bd);
+ output[12] = WRAPLOW(x5, bd);
+ output[13] = WRAPLOW(-x13, bd);
+ output[14] = WRAPLOW(x9, bd);
+ output[15] = WRAPLOW(-x1, bd);
+}
+
+void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 4x4 area, we only need to calculate first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void highbd_idct32_c(const tran_low_t *input,
+ tran_low_t *output, int bd) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+ step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+ step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+ step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+ step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+ step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+ step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+ step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+ step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+ step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+ step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+ step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+ step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+ step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+ step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+ step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+ step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+ step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+ step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+ step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+ step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+ step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+ step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+ step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+ step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+ step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+ step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+ step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+ step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+ step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+ step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+ step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+ step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+ step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+ step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+ step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+ step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+ step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+ step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+ step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+ step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+ step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+ step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+ step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+ step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+ step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+ step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+ step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+ step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+ step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+ step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+ step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+ step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+ step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+ step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+ step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+ step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31], bd);
+ output[1] = WRAPLOW(step1[1] + step1[30], bd);
+ output[2] = WRAPLOW(step1[2] + step1[29], bd);
+ output[3] = WRAPLOW(step1[3] + step1[28], bd);
+ output[4] = WRAPLOW(step1[4] + step1[27], bd);
+ output[5] = WRAPLOW(step1[5] + step1[26], bd);
+ output[6] = WRAPLOW(step1[6] + step1[25], bd);
+ output[7] = WRAPLOW(step1[7] + step1[24], bd);
+ output[8] = WRAPLOW(step1[8] + step1[23], bd);
+ output[9] = WRAPLOW(step1[9] + step1[22], bd);
+ output[10] = WRAPLOW(step1[10] + step1[21], bd);
+ output[11] = WRAPLOW(step1[11] + step1[20], bd);
+ output[12] = WRAPLOW(step1[12] + step1[19], bd);
+ output[13] = WRAPLOW(step1[13] + step1[18], bd);
+ output[14] = WRAPLOW(step1[14] + step1[17], bd);
+ output[15] = WRAPLOW(step1[15] + step1[16], bd);
+ output[16] = WRAPLOW(step1[15] - step1[16], bd);
+ output[17] = WRAPLOW(step1[14] - step1[17], bd);
+ output[18] = WRAPLOW(step1[13] - step1[18], bd);
+ output[19] = WRAPLOW(step1[12] - step1[19], bd);
+ output[20] = WRAPLOW(step1[11] - step1[20], bd);
+ output[21] = WRAPLOW(step1[10] - step1[21], bd);
+ output[22] = WRAPLOW(step1[9] - step1[22], bd);
+ output[23] = WRAPLOW(step1[8] - step1[23], bd);
+ output[24] = WRAPLOW(step1[7] - step1[24], bd);
+ output[25] = WRAPLOW(step1[6] - step1[25], bd);
+ output[26] = WRAPLOW(step1[5] - step1[26], bd);
+ output[27] = WRAPLOW(step1[4] - step1[27], bd);
+ output[28] = WRAPLOW(step1[3] - step1[28], bd);
+ output[29] = WRAPLOW(step1[2] - step1[29], bd);
+ output[30] = WRAPLOW(step1[1] - step1[30], bd);
+ output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_low_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ highbd_idct32_c(input, outptr, bd);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ // Only upper-left 8x8 has non-zero coeff.
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32_c(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ int a1;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_inv_txfm.h b/vp10/common/vp10_inv_txfm.h
new file mode 100644
index 0000000..52611ac
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid VP9 input streams, intermediate stage coefficients should always
+ // stay within the range of a signed 16 bit integer. Coefficients can go out
+ // of this range for invalid/corrupt VP9 streams. However, strictly checking
+ // this range for every intermediate coefficient can burdensome for a decoder,
+ // therefore the following assertion is only enabled when configured with
+ // --enable-coefficient-range-checking.
+ assert(INT16_MIN <= input);
+ assert(input <= INT16_MAX);
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return check_range(rv);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+ int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void) int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void) bd;
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+ int bd) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return highbd_check_range(rv, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However to aid in hardware
+// verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C-implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// bd of 8 uses trans_low with 16bits, need to remove 16bits
+// bd of 10 uses trans_low with 18bits, need to remove 14bits
+// bd of 12 uses trans_low with 20bits, need to remove 12bits
+// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif // CONFIG_EMULATE_HARDWARE
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = WRAPLOW(trans, bd);
+ return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+ trans = WRAPLOW(trans, 8);
+ return clip_pixel(WRAPLOW(dest + trans, 8));
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // VPX_DSP_INV_TXFM_H_
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 37b9622..0d14ad8 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -95,6 +95,57 @@
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
+
+ add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4 sse2/;
+
+ add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4_1 sse2/;
+
+ add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8 sse2/;
+
+ add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8_1 sse2/;
+
+ add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16 sse2/;
+
+ add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16_1 sse2/;
+
+ add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32 sse2/;
+
+ add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_1 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct4x4 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8_1/;
+
+ add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16_1/;
+
+ add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_1/;
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@@ -106,6 +157,33 @@
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
+
+ add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4/;
+
+ add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4_1/;
+
+ add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8/;
+
+ add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8_1/;
+
+ add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16/;
+
+ add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16_1/;
+
+ add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32/;
+
+ add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_rd/;
+
+ add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_1/;
} else {
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
@@ -115,6 +193,33 @@
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
+
+ add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4 sse2/;
+
+ add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4_1 sse2/;
+
+ add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8 sse2/;
+
+ add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8_1 sse2/;
+
+ add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16 sse2/;
+
+ add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16_1 sse2/;
+
+ add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32 sse2/;
+
+ add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_1 sse2/;
}
}
@@ -289,6 +394,188 @@
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
+# Inverse transform
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ # Note as optimized versions of these functions are added we need to add a check to ensure
+ # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+
+ add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_1_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_1_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_1_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_34_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_1_add/;
+
+ add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_iwht4x4_16_add/;
+
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+ add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_16_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_64_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_10_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_256_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_10_add/;
+ } else {
+ add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_10_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_10_add sse2/;
+ } # CONFIG_EMULATE_HARDWARE
+} else {
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+ } else {
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add sse2/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add sse2/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+ } # CONFIG_EMULATE_HARDWARE
+} # CONFIG_VP9_HIGHBITDEPTH
+
#
# Motion search
#
diff --git a/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h b/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h
new file mode 100644
index 0000000..ef78709
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3153 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "vp10/common/vp10_fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also causes cross reference to the static
+// functions in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+#if FDCT32x32_HIGH_PRECISION
+void vp10_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp10_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+ #define HIGH_FDCT32x32_2D_C vp10_highbd_fdct32x32_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp10_fdct32x32_rows_c
+#else
+void vp10_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp10_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+ #define HIGH_FDCT32x32_2D_C vp10_highbd_fdct32x32_rd_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp10_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
+
+
+void FDCT32x32_2D(const int16_t *input,
+ tran_low_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic make the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[ 0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[ 4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[ 8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+ step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+ step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+ step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+ // dump the magnitude by half, hence the intermediate values are within
+ // the range of 16 bits.
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15],
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+ &step3[12], &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19],
+ &step3[20], &step3[21], &step3[22], &step3[23],
+ &step3[24], &step3[25], &step3[26], &step3[27],
+ &step3[28], &step3[29], &step3[30], &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+ // Stage 4
+ {
+ step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+ step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+ step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+ step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+ step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+ step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+ step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+ step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3],
+ &step1[4], &step1[5], &step1[6], &step1[7],
+ &step1[8], &step1[9], &step1[10], &step1[11],
+ &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 5
+ {
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+ &step2[6], &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+ &step2[13], &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step2[28], &step2[29], &step2[30], &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
+ step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step1[20], &step1[21], &step1[22], &step1[23],
+ &step1[24], &step1[25], &step1[26], &step1[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Final stage --- outputs indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m128i lstep1[64], lstep2[64], lstep3[64];
+ __m128i u[32], v[32], sign[16];
+ const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+ // start using 32-bit operations
+ // stage 3
+ {
+ // expanding to 32-bit length priori to addition operations
+ lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
+
+ lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
+
+ lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
+ lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+ // expanding to 32-bit length priori to addition operations
+ lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
+
+ lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
+ lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
+ lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
+ lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
+ lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
+ lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
+ lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
+ lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+ &v[4], &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
+ lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ // Combine
+ out[ 0] = _mm_packs_epi32(u[0], u[1]);
+ out[16] = _mm_packs_epi32(u[2], u[3]);
+ out[ 8] = _mm_packs_epi32(u[4], u[5]);
+ out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32(u[7], k32_m24_m08);
+ v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+ v[0] = k_madd_epi32(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32(u[7], k32_p12_p20);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ out[ 4] = _mm_packs_epi32(u[0], u[1]);
+ out[20] = _mm_packs_epi32(u[2], u[3]);
+ out[12] = _mm_packs_epi32(u[4], u[5]);
+ out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
+ -cospi_20_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
+ v[22] = k_madd_epi32(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+ const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+ const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
+ v[10] = k_madd_epi32(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
+ v[22] = k_madd_epi32(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], K32One);
+ v[ 1] = _mm_add_epi32(u[ 1], K32One);
+ v[ 2] = _mm_add_epi32(u[ 2], K32One);
+ v[ 3] = _mm_add_epi32(u[ 3], K32One);
+ v[ 4] = _mm_add_epi32(u[ 4], K32One);
+ v[ 5] = _mm_add_epi32(u[ 5], K32One);
+ v[ 6] = _mm_add_epi32(u[ 6], K32One);
+ v[ 7] = _mm_add_epi32(u[ 7], K32One);
+ v[ 8] = _mm_add_epi32(u[ 8], K32One);
+ v[ 9] = _mm_add_epi32(u[ 9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], 2);
+ u[ 1] = _mm_srai_epi32(v[ 1], 2);
+ u[ 2] = _mm_srai_epi32(v[ 2], 2);
+ u[ 3] = _mm_srai_epi32(v[ 3], 2);
+ u[ 4] = _mm_srai_epi32(v[ 4], 2);
+ u[ 5] = _mm_srai_epi32(v[ 5], 2);
+ u[ 6] = _mm_srai_epi32(v[ 6], 2);
+ u[ 7] = _mm_srai_epi32(v[ 7], 2);
+ u[ 8] = _mm_srai_epi32(v[ 8], 2);
+ u[ 9] = _mm_srai_epi32(v[ 9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 2] = _mm_packs_epi32(u[0], u[1]);
+ out[18] = _mm_packs_epi32(u[2], u[3]);
+ out[10] = _mm_packs_epi32(u[4], u[5]);
+ out[26] = _mm_packs_epi32(u[6], u[7]);
+ out[ 6] = _mm_packs_epi32(u[8], u[9]);
+ out[22] = _mm_packs_epi32(u[10], u[11]);
+ out[14] = _mm_packs_epi32(u[12], u[13]);
+ out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+ const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+ const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+ const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+ const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
+ v[10] = k_madd_epi32(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
+ v[22] = k_madd_epi32(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 1] = _mm_packs_epi32(u[0], u[1]);
+ out[17] = _mm_packs_epi32(u[2], u[3]);
+ out[ 9] = _mm_packs_epi32(u[4], u[5]);
+ out[25] = _mm_packs_epi32(u[6], u[7]);
+ out[ 7] = _mm_packs_epi32(u[8], u[9]);
+ out[23] = _mm_packs_epi32(u[10], u[11]);
+ out[15] = _mm_packs_epi32(u[12], u[13]);
+ out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+ const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+ const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+ const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+ const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
+ v[10] = k_madd_epi32(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
+ v[22] = k_madd_epi32(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 5] = _mm_packs_epi32(u[0], u[1]);
+ out[21] = _mm_packs_epi32(u[2], u[3]);
+ out[13] = _mm_packs_epi32(u[4], u[5]);
+ out[29] = _mm_packs_epi32(u[6], u[7]);
+ out[ 3] = _mm_packs_epi32(u[8], u[9]);
+ out[19] = _mm_packs_epi32(u[10], u[11]);
+ out[11] = _mm_packs_epi32(u[12], u[13]);
+ out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+#endif // FDCT32x32_HIGH_PRECISION
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 54 54 55 55 56 56 57 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 21 36
+ // 44 54 64 74 45 55 61 76
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic make the code slightly slower.
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
+ }
+ }
+ }
+ }
+} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h b/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000..69889e2
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+ // For the purposes of the comments, the 16 inputs are referred to at i0
+ // through iF (in raster order), intermediate variables are a0, b0, c0
+ // through f, and correspond to the in-place computations mapped to input
+ // locations. The outputs, o0 through oF are labeled according to the
+ // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64,
+ -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64,
+ -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+ +(DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+ // Check inputs small enough to use optimised code
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero, all
+ // other comparison will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+ // There are 4 total stages, alternating between an add/subtract stage
+ // followed by an multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+ // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+ // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+ // but this combines the final right-shift as well to save operations
+ // This unusual rounding operations is to maintain bit-accurate
+ // compatibility with the c version of this function which has two
+ // rounding steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+ // Post-condition (v + 1) >> 2 is now incorporated into previous
+ // add and right-shift commands. Only 2 store instructions needed
+ // because we are using the fact that 1/3 are stored just after 0/2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 54 54 55 55 56 56 57 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 21 36
+ // 44 54 64 74 45 55 61 76
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+ const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res02, &res14,
+ &res10, &res06);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+ &step2_4);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+ &step3_2, &step3_3,
+ &step3_4, &step3_5,
+ &step3_6, &step3_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+ &step2_5);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03,
+ &res04, &res05, &res06, &res07,
+ pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11,
+ &res12, &res13, &res14, &res15,
+ pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8*16;
+ } else {
+ out1 += 8*16;
+ }
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/vp10/common/x86/vp10_fwd_txfm_sse2.c b/vp10/common/x86/vp10_fwd_txfm_sse2.c
new file mode 100644
index 0000000..032c3cc
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm_sse2.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+void vp10_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ store_output(&in0, output);
+}
+
+void vp10_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ store_output(&in1, output);
+}
+
+void vp10_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ input += 8 * i;
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ store_output(&in1, output);
+}
+
+void vp10_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ store_output(&in1, output);
+}
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D vp10_fdct4x4_sse2
+#define FDCT8x8_2D vp10_fdct8x8_sse2
+#define FDCT16x16_2D vp10_fdct16x16_sse2
+#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vp10_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h"
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp10_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vp10_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vp10_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vp10_highbd_fdct16x16_sse2
+#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vp10_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp10_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/x86/vp10_inv_txfm_sse2.c b/vp10/common/x86/vp10_inv_txfm_sse2.c
new file mode 100644
index 0000000..b25e22e
--- /dev/null
+++ b/vp10/common/x86/vp10_inv_txfm_sse2.c
@@ -0,0 +1,4058 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/x86/vp10_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+#define RECON_AND_STORE4X4(dest, in_x) \
+{ \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vp10_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i cst = _mm_setr_epi16(
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i input0, input1, input2, input3;
+
+ // Rows
+ input0 = _mm_load_si128((const __m128i *)input);
+ input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_shufflelo_epi16(input0, 0xd8);
+ input0 = _mm_shufflehi_epi16(input0, 0xd8);
+ input2 = _mm_shufflelo_epi16(input2, 0xd8);
+ input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+ input1 = _mm_unpackhi_epi32(input0, input0);
+ input0 = _mm_unpacklo_epi32(input0, input0);
+ input3 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpacklo_epi32(input2, input2);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input1);
+ input1 = _mm_packs_epi32(input2, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+ // Switch column2, column 3, and then, we got:
+ // input2: column1, column 0; input3: column2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Columns
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_unpacklo_epi32(input2, input2);
+ input1 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpackhi_epi32(input3, input3);
+ input3 = _mm_unpacklo_epi32(input3, input3);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input2);
+ input1 = _mm_packs_epi32(input1, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+ // Switch column2, column 3, and then, we got:
+ // input2: column1, column 0; input3: column2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Final round and shift
+ input2 = _mm_add_epi16(input2, eight);
+ input3 = _mm_add_epi16(input3, eight);
+
+ input2 = _mm_srai_epi16(input2, 4);
+ input3 = _mm_srai_epi16(input3, 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void vp10_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void vp10_idct4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+
+ transpose_4x4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[3], v[2]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void vp10_iadst4_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8], in7;
+
+ transpose_4x4(in);
+ in7 = _mm_srli_si128(in[1], 8);
+ in7 = _mm_add_epi16(in7, in[0]);
+ in7 = _mm_sub_epi16(in7, in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(v[3], v[4]);
+ u[2] = v[2];
+ u[3] = _mm_add_epi32(u[0], u[1]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_add_epi32(u[3], v[5]);
+ u[6] = _mm_sub_epi32(u[5], u[4]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+ out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ }
+
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+ cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_4, \
+ stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_0, \
+ stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+ out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+ }
+
+void vp10_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vp10_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ // 4-stage 1D vp10_idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ }
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void vp10_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+void vp10_idct8_sse2(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ // 8x8 Transpose is copied from vp10_fdct8x8_sse2()
+ TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ // 4-stage 1D vp10_idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+void vp10_iadst8_sse2(__m128i *in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // transpose
+ array_transpose_8x8(in, in);
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit intergers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void vp10_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ // Rows. Load 4-row input data.
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+ // 8x4 Transpose
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+ // Stage1
+ {
+ const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+ const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+ }
+
+ // Stage2
+ {
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+ const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+ tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+ tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+ tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+ tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+ stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+ tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+#define IDCT16 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+ stg2_0, stg2_1, stg2_2, stg2_3, \
+ stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+ stg2_4, stg2_5, stg2_6, stg2_7, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+ stg3_0, stg3_1, stg3_2, stg3_3, \
+ stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ \
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+ stg4_0, stg4_1, stg4_2, stg4_3, \
+ stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
+
+#define IDCT16_10 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+ stg2_0, stg2_1, stg2_6, stg2_7, \
+ stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+ stg3_0, stg3_1, \
+ stp2_4, stp2_7) \
+ \
+ stp1_9 = stp1_8_0; \
+ stp1_10 = stp1_11; \
+ \
+ stp1_13 = stp1_12_0; \
+ stp1_14 = stp1_15; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+ stg4_0, stg4_1, \
+ stp1_0, stp1_1) \
+ stp2_5 = stp2_4; \
+ stp2_6 = stp2_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_2 = stp1_1; \
+ stp1_3 = stp1_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
+
+void vp10_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[16], l[16], r[16], *curr1;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ curr1 = l;
+ for (i = 0; i < 2; i++) {
+ // 1-D vp10_idct
+
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+
+ IDCT16
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
+ }
+ for (i = 0; i < 2; i++) {
+ int j;
+ // 1-D vp10_idct
+ array_transpose_8x8(l + i * 8, in);
+ array_transpose_8x8(r + i * 8, in + 8);
+
+ IDCT16
+
+ // 2-D
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct16x16_1_add_sse2(const int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 2; ++i) {
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+ RECON_AND_STORE(dest + 8 * stride, dc_value);
+ RECON_AND_STORE(dest + 9 * stride, dc_value);
+ RECON_AND_STORE(dest + 10 * stride, dc_value);
+ RECON_AND_STORE(dest + 11 * stride, dc_value);
+ RECON_AND_STORE(dest + 12 * stride, dc_value);
+ RECON_AND_STORE(dest + 13 * stride, dc_value);
+ RECON_AND_STORE(dest + 14 * stride, dc_value);
+ RECON_AND_STORE(dest + 15 * stride, dc_value);
+ dest += 8;
+ }
+}
+
+static void vp10_iadst16_8col(__m128i *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void vp10_idct16_8col(__m128i *in) {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i v[16], u[16], s[16], t[16];
+
+ // stage 1
+ s[0] = in[0];
+ s[1] = in[8];
+ s[2] = in[4];
+ s[3] = in[12];
+ s[4] = in[2];
+ s[5] = in[10];
+ s[6] = in[6];
+ s[7] = in[14];
+ s[8] = in[1];
+ s[9] = in[9];
+ s[10] = in[5];
+ s[11] = in[13];
+ s[12] = in[3];
+ s[13] = in[11];
+ s[14] = in[7];
+ s[15] = in[15];
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+ u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+ u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+ u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[8] = _mm_packs_epi32(u[0], u[1]);
+ s[15] = _mm_packs_epi32(u[2], u[3]);
+ s[9] = _mm_packs_epi32(u[4], u[5]);
+ s[14] = _mm_packs_epi32(u[6], u[7]);
+ s[10] = _mm_packs_epi32(u[8], u[9]);
+ s[13] = _mm_packs_epi32(u[10], u[11]);
+ s[11] = _mm_packs_epi32(u[12], u[13]);
+ s[12] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ t[0] = s[0];
+ t[1] = s[1];
+ t[2] = s[2];
+ t[3] = s[3];
+ u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[4] = _mm_packs_epi32(u[0], u[1]);
+ t[7] = _mm_packs_epi32(u[2], u[3]);
+ t[5] = _mm_packs_epi32(u[4], u[5]);
+ t[6] = _mm_packs_epi32(u[6], u[7]);
+ t[8] = _mm_add_epi16(s[8], s[9]);
+ t[9] = _mm_sub_epi16(s[8], s[9]);
+ t[10] = _mm_sub_epi16(s[11], s[10]);
+ t[11] = _mm_add_epi16(s[10], s[11]);
+ t[12] = _mm_add_epi16(s[12], s[13]);
+ t[13] = _mm_sub_epi16(s[12], s[13]);
+ t[14] = _mm_sub_epi16(s[15], s[14]);
+ t[15] = _mm_add_epi16(s[14], s[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+ u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+ u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+ u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+ u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+ u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_add_epi16(t[4], t[5]);
+ s[5] = _mm_sub_epi16(t[4], t[5]);
+ s[6] = _mm_sub_epi16(t[7], t[6]);
+ s[7] = _mm_add_epi16(t[6], t[7]);
+ s[8] = t[8];
+ s[15] = t[15];
+ s[9] = _mm_packs_epi32(u[8], u[9]);
+ s[14] = _mm_packs_epi32(u[10], u[11]);
+ s[10] = _mm_packs_epi32(u[12], u[13]);
+ s[13] = _mm_packs_epi32(u[14], u[15]);
+ s[11] = t[11];
+ s[12] = t[12];
+
+ // stage 5
+ t[0] = _mm_add_epi16(s[0], s[3]);
+ t[1] = _mm_add_epi16(s[1], s[2]);
+ t[2] = _mm_sub_epi16(s[1], s[2]);
+ t[3] = _mm_sub_epi16(s[0], s[3]);
+ t[4] = s[4];
+ t[7] = s[7];
+
+ u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ t[5] = _mm_packs_epi32(u[0], u[1]);
+ t[6] = _mm_packs_epi32(u[2], u[3]);
+
+ t[8] = _mm_add_epi16(s[8], s[11]);
+ t[9] = _mm_add_epi16(s[9], s[10]);
+ t[10] = _mm_sub_epi16(s[9], s[10]);
+ t[11] = _mm_sub_epi16(s[8], s[11]);
+ t[12] = _mm_sub_epi16(s[15], s[12]);
+ t[13] = _mm_sub_epi16(s[14], s[13]);
+ t[14] = _mm_add_epi16(s[13], s[14]);
+ t[15] = _mm_add_epi16(s[12], s[15]);
+
+ // stage 6
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+
+ // stage 7
+ in[0] = _mm_add_epi16(s[0], s[15]);
+ in[1] = _mm_add_epi16(s[1], s[14]);
+ in[2] = _mm_add_epi16(s[2], s[13]);
+ in[3] = _mm_add_epi16(s[3], s[12]);
+ in[4] = _mm_add_epi16(s[4], s[11]);
+ in[5] = _mm_add_epi16(s[5], s[10]);
+ in[6] = _mm_add_epi16(s[6], s[9]);
+ in[7] = _mm_add_epi16(s[7], s[8]);
+ in[8] = _mm_sub_epi16(s[7], s[8]);
+ in[9] = _mm_sub_epi16(s[6], s[9]);
+ in[10] = _mm_sub_epi16(s[5], s[10]);
+ in[11] = _mm_sub_epi16(s[4], s[11]);
+ in[12] = _mm_sub_epi16(s[3], s[12]);
+ in[13] = _mm_sub_epi16(s[2], s[13]);
+ in[14] = _mm_sub_epi16(s[1], s[14]);
+ in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
+void vp10_idct16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ vp10_idct16_8col(in0);
+ vp10_idct16_8col(in1);
+}
+
+void vp10_iadst16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ vp10_iadst16_8col(in0);
+ vp10_iadst16_8col(in1);
+}
+
+void vp10_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i in[16], l[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+ // Stage2
+ {
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+ tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+ tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+ tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp2_8 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+ tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+ stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+ tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+ tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+ tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+ stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+ stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+ stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+ }
+
+ // Stage5 and Stage6
+ {
+ tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+ tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+ tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+ tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+ stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+ stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+ stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+ stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+ }
+
+ // Stage6
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+ tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+ tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+ tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+ tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+ stp2_10 = _mm_packs_epi32(tmp0, zero);
+ stp2_13 = _mm_packs_epi32(tmp2, zero);
+ stp2_11 = _mm_packs_epi32(tmp4, zero);
+ stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+ tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+ tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+ tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+ tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+ stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+ stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+ stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+ stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+ stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage7. Left 8x16 only.
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 2; i++) {
+ int j;
+ array_transpose_4X8(l + 8 * i, in);
+
+ IDCT16_10
+
+ // Stage7
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+#define LOAD_DQCOEFF(reg, input) \
+ { \
+ reg = _mm_load_si128((const __m128i *) input); \
+ input += 8; \
+ } \
+
+#define IDCT32_34 \
+/* Stage1 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+ const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+ stg1_1, stp1_16, stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+ stg1_7, stp1_19, stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+ stg1_9, stp1_20, stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+ stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+ stg2_1, stp2_8, stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+ stg2_7, stp2_11, stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+ stg3_1, stp1_4, stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+ stg4_1, stp2_0, stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+ const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vp10_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+ // vp10_idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[32];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 32));
+ in[2] = _mm_load_si128((const __m128i *)(input + 64));
+ in[3] = _mm_load_si128((const __m128i *)(input + 96));
+ in[4] = _mm_load_si128((const __m128i *)(input + 128));
+ in[5] = _mm_load_si128((const __m128i *)(input + 160));
+ in[6] = _mm_load_si128((const __m128i *)(input + 192));
+ in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+ for (i = 8; i < 32; ++i) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ array_transpose_8x8(in, in);
+ // TODO(hkuang): Following transposes are unnecessary. But remove them will
+ // lead to performance drop on some devices.
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32_34
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ for (i = 0; i < 4; i++) {
+ int j;
+ const __m128i zero = _mm_setzero_si128();
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ IDCT32_34
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ // vp10_idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[128], zero_idx[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+ for (i = 0; i < 4; i++) {
+ i32 = (i << 5);
+ // First 1-D vp10_idct
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
+
+ // checking if all entries are zero
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ }
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ }
+ for (i = 0; i < 4; i++) {
+ // Second 1-D vp10_idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ IDCT32
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct32x32_1_add_sse2(const int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 4; ++i) {
+ int j;
+ for (j = 0; j < 32; ++j) {
+ RECON_AND_STORE(dest + j * stride, dc_value);
+ }
+ dest += 8;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
+
+void vp10_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ transpose_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_8x8(inptr, inptr);
+ for (i = 0; i < 8; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 8; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct8_sse2(inptr);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ // only first 4 row has non-zero coefs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ // Use fact only first 4 rows contain non-zero coeffs
+ array_transpose_4X8(inptr, inptr);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct8_sse2(inptr);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_16x16(inptr, inptr + 16);
+ for (i = 0; i < 16; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 16; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ // Since all non-zero dct coefficients are in upper-left 4x4 area,
+ // we only need to consider first 4 rows here.
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform (N.B. This transposes inptr)
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 16; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ // Use fact only first 4 rows contain non-zero coeffs
+ array_transpose_8x8(inptr, inptr);
+ array_transpose_8x8(inptr + 8, inptr + 16);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/x86/vp10_inv_txfm_sse2.h b/vp10/common/x86/vp10_inv_txfm_sse2.h
new file mode 100644
index 0000000..b79781a
--- /dev/null
+++ b/vp10/common/x86/vp10_inv_txfm_sse2.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
+}
+
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
+ }
+
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 10 * stride, in[10]);
+ RECON_AND_STORE(dest + 11 * stride, in[11]);
+ RECON_AND_STORE(dest + 12 * stride, in[12]);
+ RECON_AND_STORE(dest + 13 * stride, in[13]);
+ RECON_AND_STORE(dest + 14 * stride, in[14]);
+ RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index 7008f33..cd3baa7 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -115,7 +115,6 @@
// indicate if it is in the rd search loop or encoding process
int use_lp32x32fdct;
- int skip_encode;
// use fast quantization process
int quant_fp;
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 65e9045..ddea8cd 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -2585,20 +2585,6 @@
return cpi->common.tx_mode;
}
-static int get_skip_encode_frame(const VP10_COMMON *cm, ThreadData *const td) {
- unsigned int intra_count = 0, inter_count = 0;
- int j;
-
- for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
- intra_count += td->counts->intra_inter[j][0];
- inter_count += td->counts->intra_inter[j][1];
- }
-
- return (intra_count << 2) < inter_count &&
- cm->frame_type != KEY_FRAME &&
- cm->show_frame;
-}
-
void vp10_init_tile_data(VP10_COMP *cpi) {
VP10_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -2690,7 +2676,6 @@
#endif
static void encode_frame_internal(VP10_COMP *cpi) {
- SPEED_FEATURES *const sf = &cpi->sf;
ThreadData *const td = &cpi->td;
MACROBLOCK *const x = &td->mb;
VP10_COMMON *const cm = &cpi->common;
@@ -2766,9 +2751,6 @@
cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
}
- sf->skip_encode_frame = sf->skip_encode_sb ?
- get_skip_encode_frame(cm, td) : 0;
-
#if 0
// Keep record of the total distortion this time around for future use
cpi->last_frame_distortion = cpi->frame_distortion;
@@ -2962,11 +2944,6 @@
x->skip_optimize = ctx->is_coded;
ctx->is_coded = 1;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
- x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
- x->q_index < QIDX_SKIP_THRESH);
-
- if (x->skip_encode)
- return;
if (!is_inter_block(mbmi)) {
int plane;
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index fbc9848..e2c038a 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -797,7 +797,7 @@
if (p->eobs[block])
*(args->skip) = 0;
- if (x->skip_encode || p->eobs[block] == 0)
+ if (p->eobs[block] == 0)
return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -945,8 +945,7 @@
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
- vp10_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
- x->skip_encode ? src_stride : dst_stride,
+ vp10_predict_intra_block(xd, bwl, tx_size, mode, dst, dst_stride,
dst, dst_stride, i, j, plane);
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1036,7 +1035,7 @@
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
- if (!x->skip_encode && *eob)
+ if (*eob)
vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_16X16:
@@ -1049,7 +1048,7 @@
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
- if (!x->skip_encode && *eob)
+ if (*eob)
vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_8X8:
@@ -1062,7 +1061,7 @@
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
- if (!x->skip_encode && *eob)
+ if (*eob)
vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_4X4:
@@ -1076,7 +1075,7 @@
scan_order->iscan);
}
- if (!x->skip_encode && *eob) {
+ if (*eob) {
// this is like vp10_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index 63c42cd..d155316 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -614,7 +614,6 @@
cm->mi_rows, cm->mi_cols);
// Do intra 16x16 prediction.
- x->skip_encode = 0;
xd->mi[0]->mbmi.mode = DC_PRED;
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 24d359e..fd53bb6 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -438,19 +438,6 @@
&this_sse) >> shift;
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_sse = this_sse >> shift;
-
- if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
- // TODO(jingning): tune the model to better capture the distortion.
- int64_t p = (pd->dequant[1] * pd->dequant[1] *
- (1 << ss_txfrm_size)) >>
-#if CONFIG_VP9_HIGHBITDEPTH
- (shift + 2 + (bd - 8) * 2);
-#else
- (shift + 2);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- *out_dist += (p >> 4);
- *out_sse += p;
- }
}
static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -903,9 +890,7 @@
vp10_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
- vp10_predict_intra_block(xd, 1, TX_4X4, mode,
- x->skip_encode ? src : dst,
- x->skip_encode ? src_stride : dst_stride,
+ vp10_predict_intra_block(xd, 1, TX_4X4, mode, dst, dst_stride,
dst, dst_stride, col + idx, row + idy, 0);
vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
@@ -961,7 +946,7 @@
{}
}
- if (best_rd >= rd_thresh || x->skip_encode)
+ if (best_rd >= rd_thresh)
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
@@ -2739,7 +2724,6 @@
int y_skip = 0, uv_skip = 0;
int64_t dist_y = 0, dist_uv = 0;
TX_SIZE max_uv_tx_size;
- x->skip_encode = 0;
ctx->skip = 0;
xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
xd->mi[0]->mbmi.ref_frame[1] = NONE;
@@ -2912,14 +2896,13 @@
}
void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
- TileDataEnc *tile_data,
- MACROBLOCK *x,
- int mi_row, int mi_col,
- RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx,
- int64_t best_rd_so_far) {
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
VP10_COMMON *const cm = &cpi->common;
- TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2970,8 +2953,6 @@
vp10_zero(best_mbmode);
- x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
-
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
filter_cache[i] = INT64_MAX;
@@ -3013,7 +2994,7 @@
// are masked out.
ref_frame_skip_mask[0] |= (1 << ref_frame);
ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
- } else if (sf->reference_masking) {
+ } else {
for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
// Skip fixed mv modes for poor references
if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
@@ -3152,55 +3133,6 @@
if (best_rd < mode_threshold[mode_index])
continue;
- if (sf->motion_field_mode_search) {
- const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
- tile_info->mi_col_end - mi_col);
- const int mi_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
- tile_info->mi_row_end - mi_row);
- const int bsl = mi_width_log2_lookup[bsize];
- int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
- + get_chessboard_index(cm->current_video_frame)) & 0x1;
- MB_MODE_INFO *ref_mbmi;
- int const_motion = 1;
- int skip_ref_frame = !cb_partition_search_ctrl;
- MV_REFERENCE_FRAME rf = NONE;
- int_mv ref_mv;
- ref_mv.as_int = INVALID_MV;
-
- if ((mi_row - 1) >= tile_info->mi_row_start) {
- ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
- rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
- for (i = 0; i < mi_width; ++i) {
- ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
- const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
- (ref_frame == ref_mbmi->ref_frame[0]);
- skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
- }
- }
-
- if ((mi_col - 1) >= tile_info->mi_col_start) {
- if (ref_mv.as_int == INVALID_MV)
- ref_mv = xd->mi[-1]->mbmi.mv[0];
- if (rf == NONE)
- rf = xd->mi[-1]->mbmi.ref_frame[0];
- for (i = 0; i < mi_height; ++i) {
- ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
- const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
- (ref_frame == ref_mbmi->ref_frame[0]);
- skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
- }
- }
-
- if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
- if (rf > INTRA_FRAME)
- if (ref_frame != rf)
- continue;
-
- if (const_motion)
- if (this_mode == NEARMV || this_mode == ZEROMV)
- continue;
- }
-
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
if (!cpi->allow_comp_inter_inter)
@@ -3613,8 +3545,6 @@
int rate2 = 0;
const int64_t distortion2 = 0;
- x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
-
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
@@ -3738,7 +3668,6 @@
int internal_active_edge =
vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
- x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
memset(x->zcoeff_blk[TX_4X4], 0, 4);
vp10_zero(best_mbmode);
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index 156c4ea..d40383f 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -148,9 +148,6 @@
sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
: USE_LARGESTALL;
- // Reference masking is not supported in dynamic scaling mode.
- sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
-
sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
@@ -192,7 +189,6 @@
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
- sf->motion_field_mode_search = !boosted;
sf->partition_search_breakout_rate_thr = 300;
}
@@ -280,13 +276,6 @@
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->adaptive_pred_interp_filter = 2;
-
- // Disable reference masking if using spatial scaling since
- // pred_mv_sad will not be set (since vp10_mv_pred will not
- // be called).
- // TODO(marpan/agrange): Fix this condition.
- sf->reference_masking = (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) ? 1 : 0;
-
sf->disable_filter_search_var_thresh = 50;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
@@ -302,7 +291,6 @@
sf->use_square_partition_only = 1;
sf->disable_filter_search_var_thresh = 100;
sf->use_uv_intra_rd_estimate = 1;
- sf->skip_encode_sb = 1;
sf->mv.subpel_iters_per_step = 1;
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
@@ -385,7 +373,6 @@
// Turn on this to use non-RD key frame coding mode.
sf->mv.search_method = NSTEP;
sf->mv.reduce_first_step_size = 1;
- sf->skip_encode_sb = 0;
}
if (speed >= 7) {
@@ -457,10 +444,8 @@
sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
- sf->motion_field_mode_search = 0;
sf->alt_ref_search_fp = 0;
sf->use_quant_fp = 0;
- sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
@@ -483,7 +468,6 @@
sf->intra_uv_mode_mask[i] = INTRA_ALL;
}
sf->use_rd_breakout = 0;
- sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
sf->allow_skip_recode = 0;
sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 08a1893..3969a2f 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -223,11 +223,6 @@
// mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
- // Enables skipping the reconstruction step (idct, recon) in the
- // intermediate steps assuming the last frame didn't have too many intra
- // blocks and the q is less than a threshold.
- int skip_encode_sb;
- int skip_encode_frame;
// Speed feature to allow or disallow skipping of recode at block
// level within a frame.
int allow_skip_recode;
@@ -253,9 +248,6 @@
// of the best so far.
int mode_skip_start;
- // TODO(JBB): Remove this.
- int reference_masking;
-
PARTITION_SEARCH_TYPE partition_search_type;
// Used if partition_search_type = FIXED_SIZE_PARTITION
@@ -314,8 +306,6 @@
int cb_partition_search;
- int motion_field_mode_search;
-
int alt_ref_search_fp;
// Fast quantization process path
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 16a723c..2eb3488 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -19,7 +19,6 @@
VP10_COMMON_SRCS-yes += common/entropymv.c
VP10_COMMON_SRCS-yes += common/frame_buffers.c
VP10_COMMON_SRCS-yes += common/frame_buffers.h
-VP10_COMMON_SRCS-yes += common/idct.c
VP10_COMMON_SRCS-yes += common/alloccommon.h
VP10_COMMON_SRCS-yes += common/blockd.h
VP10_COMMON_SRCS-yes += common/common.h
@@ -30,6 +29,9 @@
VP10_COMMON_SRCS-yes += common/filter.h
VP10_COMMON_SRCS-yes += common/filter.c
VP10_COMMON_SRCS-yes += common/idct.h
+VP10_COMMON_SRCS-yes += common/idct.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.c
VP10_COMMON_SRCS-yes += common/loopfilter.h
VP10_COMMON_SRCS-yes += common/thread_common.h
VP10_COMMON_SRCS-yes += common/mv.h
@@ -59,6 +61,8 @@
VP10_COMMON_SRCS-yes += common/common_data.h
VP10_COMMON_SRCS-yes += common/scan.c
VP10_COMMON_SRCS-yes += common/scan.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c
@@ -85,10 +89,16 @@
endif
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
endif
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.h
+
$(eval $(call rtcd_h_template,vp10_rtcd,vp10/common/vp10_rtcd_defs.pl))
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index a96cf5c..ebef1a2 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -235,6 +235,12 @@
if (!is_inter_block(mbmi) || !skip)
cr->last_coded_q_map[map_offset] = clamp(
cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+ else if (is_inter_block(mbmi) && skip) {
+ cr->last_coded_q_map[map_offset] = VPXMIN(
+ clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+ 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
}
}
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1839bfe..19b0beb 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -688,10 +688,11 @@
}
#endif
-static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
- unsigned int *var_y, unsigned int *sse_y) {
+ unsigned int *var_y, unsigned int *sse_y,
+ int start_plane, int stop_plane) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -703,12 +704,12 @@
*out_rate_sum = 0;
*out_dist_sum = 0;
- for (i = 1; i <= 2; ++i) {
+ for (i = start_plane; i <= stop_plane; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
const uint32_t dc_quant = pd->dequant[0];
const uint32_t ac_quant = pd->dequant[1];
- const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE bs = plane_bsize;
unsigned int var;
if (!x->color_sensitivity[i - 1])
@@ -893,12 +894,8 @@
int i, j;
int rate;
int64_t dist;
- int64_t this_sse = INT64_MAX;
- int is_skippable;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- assert(plane == 0);
- (void) plane;
p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
@@ -908,13 +905,22 @@
x->skip_encode ? p->src.buf : pd->dst.buf,
x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
- i, j, 0);
+ i, j, plane);
- // TODO(jingning): This needs further refactoring.
- block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
- bsize_tx, VPXMIN(tx_size, TX_16X16));
- x->skip_txfm[0] = is_skippable;
- rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+ if (plane == 0) {
+ int64_t this_sse = INT64_MAX;
+ int is_skippable;
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, VPXMIN(tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ // TODO(jingning): Skip is signalled per prediciton block not per tx block.
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+ } else {
+ unsigned int var, sse;
+ model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
+ plane, plane);
+ }
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
@@ -1443,12 +1449,13 @@
if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
int uv_rate = 0;
int64_t uv_dist = 0;
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
if (x->color_sensitivity[0])
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
if (x->color_sensitivity[1])
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
- model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
- &var_y, &sse_y);
+ model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
+ &var_y, &sse_y, 1, 2);
this_rdc.rate += uv_rate;
this_rdc.dist += uv_dist;
}
@@ -1571,6 +1578,15 @@
mbmi->tx_size = intra_tx_size;
vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
estimate_block_intra, &args);
+ // Inter and intra RD will mismatch in scale for non-screen content.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+ if (x->color_sensitivity[0])
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 1,
+ estimate_block_intra, &args);
+ if (x->color_sensitivity[1])
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
+ estimate_block_intra, &args);
+ }
this_rdc.rate = args.rate;
this_rdc.dist = args.dist;
this_rdc.rate += cpi->mbmode_cost[this_mode];