Merge "Remove speed features in vp10"
diff --git a/examples.mk b/examples.mk
index dfa5a65..f10bec6 100644
--- a/examples.mk
+++ b/examples.mk
@@ -36,6 +36,8 @@
third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_win.cc \
+LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
+
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
third_party/libwebm/mkvwriter.cpp \
@@ -43,8 +45,7 @@
third_party/libwebm/mkvmuxertypes.hpp \
third_party/libwebm/mkvmuxerutil.hpp \
third_party/libwebm/mkvparser.hpp \
- third_party/libwebm/mkvwriter.hpp \
- third_party/libwebm/webmids.hpp
+ third_party/libwebm/mkvwriter.hpp
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
third_party/libwebm/mkvreader.cpp \
@@ -68,6 +69,7 @@
vpxdec.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
+ vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
endif
@@ -89,6 +91,7 @@
vpxenc.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
+ vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
endif
diff --git a/test/test.mk b/test/test.mk
index 6bb08be..3361c91 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -167,6 +167,7 @@
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
endif # CONFIG_SHARED
include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/test/vp10_inv_txfm_test.cc b/test/vp10_inv_txfm_test.cc
new file mode 100644
index 0000000..c49081e
--- /dev/null
+++ b/test/vp10_inv_txfm_test.cc
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/scan.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const double PI = 3.141592653589793238462643383279502884;
+const double kInvSqrt2 = 0.707106781186547524400844362104;
+
+void reference_idct_1d(const double *in, double *out, int size) {
+ for (int n = 0; n < size; ++n) {
+ out[n] = 0;
+ for (int k = 0; k < size; ++k) {
+ if (k == 0)
+ out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+ else
+ out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+ }
+ }
+}
+
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+ virtual ~TransTestBase() {}
+
+ protected:
+ void RunInvAccuracyCheck() {
+ tran_low_t *input = new tran_low_t[txfm_size_];
+ tran_low_t *output = new tran_low_t[txfm_size_];
+ double *ref_input = new double[txfm_size_];
+ double *ref_output = new double[txfm_size_];
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ for (int ti = 0; ti < count_test_block; ++ti) {
+ for (int ni = 0; ni < txfm_size_; ++ni) {
+ input[ni] = rnd.Rand8() - rnd.Rand8();
+ ref_input[ni] = static_cast<double>(input[ni]);
+ }
+
+ fwd_txfm_(input, output);
+ fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
+
+ for (int ni = 0; ni < txfm_size_; ++ni) {
+ EXPECT_LE(
+ abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+ max_error_);
+ }
+ }
+
+ delete[] input;
+ delete[] output;
+ delete[] ref_input;
+ delete[] ref_output;
+ }
+
+ double max_error_;
+ int txfm_size_;
+ IdctFunc fwd_txfm_;
+ IdctFuncRef fwd_txfm_ref_;
+};
+
+typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
+class Vp10InvTxfm
+ : public TransTestBase,
+ public ::testing::TestWithParam<IdctParam> {
+ public:
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = GET_PARAM(1);
+ txfm_size_ = GET_PARAM(2);
+ max_error_ = GET_PARAM(3);
+ }
+ virtual void TearDown() {}
+};
+
+TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, Vp10InvTxfm,
+ ::testing::Values(
+ IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
+ IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
+ IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
+ IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
+);
+
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef std::tr1::tuple<FwdTxfmFunc,
+ InvTxfmFunc,
+ InvTxfmFunc,
+ TX_SIZE, int> PartialInvTxfmParam;
+const int kMaxNumCoeffs = 1024;
+class Vp10PartialIDctTest
+ : public ::testing::TestWithParam<PartialInvTxfmParam> {
+ public:
+ virtual ~Vp10PartialIDctTest() {}
+ virtual void SetUp() {
+ ftxfm_ = GET_PARAM(0);
+ full_itxfm_ = GET_PARAM(1);
+ partial_itxfm_ = GET_PARAM(2);
+ tx_size_ = GET_PARAM(3);
+ last_nonzero_ = GET_PARAM(4);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int last_nonzero_;
+ TX_SIZE tx_size_;
+ FwdTxfmFunc ftxfm_;
+ InvTxfmFunc full_itxfm_;
+ InvTxfmFunc partial_itxfm_;
+};
+
+TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int size;
+ switch (tx_size_) {
+ case TX_4X4:
+ size = 4;
+ break;
+ case TX_8X8:
+ size = 8;
+ break;
+ case TX_16X16:
+ size = 16;
+ break;
+ case TX_32X32:
+ size = 32;
+ break;
+ default:
+ FAIL() << "Wrong Size!";
+ break;
+ }
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+
+ const int count_test_block = 1000;
+ const int block_size = size * size;
+
+ DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
+
+ int max_error = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // clear out destination buffer
+ memset(dst1, 0, sizeof(*dst1) * block_size);
+ memset(dst2, 0, sizeof(*dst2) * block_size);
+ memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+ memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ if (i == 0) {
+ for (int j = 0; j < block_size; ++j)
+ input_extreme_block[j] = 255;
+ } else if (i == 1) {
+ for (int j = 0; j < block_size; ++j)
+ input_extreme_block[j] = -255;
+ } else {
+ for (int j = 0; j < block_size; ++j) {
+ input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ }
+ }
+
+ ftxfm_(input_extreme_block, output_ref_block, size);
+
+ // quantization with maximum allowed step sizes
+ test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+ for (int j = 1; j < last_nonzero_; ++j)
+ test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+ = (output_ref_block[j] / 1828) * 1828;
+ }
+
+ ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+ ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+
+ for (int j = 0; j < block_size; ++j) {
+ const int diff = dst1[j] - dst2[j];
+ const int error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ }
+ }
+
+ EXPECT_EQ(0, max_error)
+ << "Error: partial inverse transform produces different results";
+}
+
+TEST_P(Vp10PartialIDctTest, ResultsMatch) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int size;
+ switch (tx_size_) {
+ case TX_4X4:
+ size = 4;
+ break;
+ case TX_8X8:
+ size = 8;
+ break;
+ case TX_16X16:
+ size = 16;
+ break;
+ case TX_32X32:
+ size = 32;
+ break;
+ default:
+ FAIL() << "Wrong Size!";
+ break;
+ }
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+ DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+ const int count_test_block = 1000;
+ const int max_coeff = 32766 / 4;
+ const int block_size = size * size;
+ int max_error = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // clear out destination buffer
+ memset(dst1, 0, sizeof(*dst1) * block_size);
+ memset(dst2, 0, sizeof(*dst2) * block_size);
+ memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+ memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+ int max_energy_leftover = max_coeff * max_coeff;
+ for (int j = 0; j < last_nonzero_; ++j) {
+ int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+ (rnd.Rand16() - 32768) / 65536);
+ max_energy_leftover -= coef * coef;
+ if (max_energy_leftover < 0) {
+ max_energy_leftover = 0;
+ coef = 0;
+ }
+ test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
+ }
+
+ memcpy(test_coef_block2, test_coef_block1,
+ sizeof(*test_coef_block2) * block_size);
+
+ ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+ ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+
+ for (int j = 0; j < block_size; ++j) {
+ const int diff = dst1[j] - dst2[j];
+ const int error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ }
+ }
+
+ EXPECT_EQ(0, max_error)
+ << "Error: partial inverse transform produces different results";
+}
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, Vp10PartialIDctTest,
+ ::testing::Values(
+ make_tuple(&vpx_fdct32x32_c,
+ &vp10_idct32x32_1024_add_c,
+ &vp10_idct32x32_34_add_c,
+ TX_32X32, 34),
+ make_tuple(&vpx_fdct32x32_c,
+ &vp10_idct32x32_1024_add_c,
+ &vp10_idct32x32_1_add_c,
+ TX_32X32, 1),
+ make_tuple(&vpx_fdct16x16_c,
+ &vp10_idct16x16_256_add_c,
+ &vp10_idct16x16_10_add_c,
+ TX_16X16, 10),
+ make_tuple(&vpx_fdct16x16_c,
+ &vp10_idct16x16_256_add_c,
+ &vp10_idct16x16_1_add_c,
+ TX_16X16, 1),
+ make_tuple(&vpx_fdct8x8_c,
+ &vp10_idct8x8_64_add_c,
+ &vp10_idct8x8_12_add_c,
+ TX_8X8, 12),
+ make_tuple(&vpx_fdct8x8_c,
+ &vp10_idct8x8_64_add_c,
+ &vp10_idct8x8_1_add_c,
+ TX_8X8, 1),
+ make_tuple(&vpx_fdct4x4_c,
+ &vp10_idct4x4_16_add_c,
+ &vp10_idct4x4_1_add_c,
+ TX_4X4, 1)));
+} // namespace
diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 91875e1..f07a52b 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,7 +1,10 @@
URL: https://chromium.googlesource.com/webm/libwebm
-Version: 2dec09426ab62b794464cc9971bd135b4d313e65
+Version: a58c32339e06e5d672a58cdd5844cea0a661e735
License: BSD
License File: LICENSE.txt
Description:
libwebm is used to handle WebM container I/O.
+
+Local Changes:
+* <none>
diff --git a/third_party/libwebm/mkvmuxer.hpp b/third_party/libwebm/mkvmuxer.hpp
index 497ad4c..03a002c 100644
--- a/third_party/libwebm/mkvmuxer.hpp
+++ b/third_party/libwebm/mkvmuxer.hpp
@@ -528,7 +528,7 @@
public:
// Audio and video type defined by the Matroska specs.
enum { kVideo = 0x1, kAudio = 0x2 };
- // Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
+
static const char kOpusCodecId[];
static const char kVorbisCodecId[];
static const char kVp8CodecId[];
diff --git a/third_party/libwebm/mkvparser.cpp b/third_party/libwebm/mkvparser.cpp
index fc01be5..4306a51 100644
--- a/third_party/libwebm/mkvparser.cpp
+++ b/third_party/libwebm/mkvparser.cpp
@@ -7,45 +7,51 @@
// be found in the AUTHORS file in the root of the source tree.
#include "mkvparser.hpp"
+
#include <cassert>
+#include <climits>
+#include <cmath>
#include <cstring>
#include <new>
-#include <climits>
+
+#include "webmids.hpp"
#ifdef _MSC_VER
// Disable MSVC warnings that suggest making code non-portable.
#pragma warning(disable : 4996)
#endif
-mkvparser::IMkvReader::~IMkvReader() {}
+namespace mkvparser {
-void mkvparser::GetVersion(int& major, int& minor, int& build, int& revision) {
+IMkvReader::~IMkvReader() {}
+
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size) {
+ if (num_elements == 0 || element_size == 0)
+ return NULL;
+
+ const size_t kMaxAllocSize = 0x80000000; // 2GiB
+ const unsigned long long num_bytes = num_elements * element_size;
+ if (element_size > (kMaxAllocSize / num_elements))
+ return NULL;
+
+ return new (std::nothrow) Type[num_bytes];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
major = 1;
minor = 0;
build = 0;
revision = 30;
}
-long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
- assert(pReader);
- assert(pos >= 0);
-
- int status;
-
- //#ifdef _DEBUG
- // long long total, available;
- // status = pReader->Length(&total, &available);
- // assert(status >= 0);
- // assert((total < 0) || (available <= total));
- // assert(pos < available);
- // assert((available - pos) >= 1); //assume here max u-int len is 8
- //#endif
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
len = 1;
-
unsigned char b;
-
- status = pReader->Read(pos, 1, &b);
+ int status = pReader->Read(pos, 1, &b);
if (status < 0) // error or underflow
return status;
@@ -63,10 +69,6 @@
++len;
}
- //#ifdef _DEBUG
- // assert((available - pos) >= len);
- //#endif
-
long long result = b & (~m);
++pos;
@@ -92,16 +94,76 @@
return result;
}
-long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
- long& len) {
- assert(pReader);
- assert(pos >= 0);
+// Reads an EBML ID and returns it.
+// An ID must at least 1 byte long, cannot exceed 4, and its value must be
+// greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+ if (pReader == NULL || pos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Read the first byte. The length in bytes of the ID is determined by
+ // finding the first set bit in the first byte of the ID.
+ unsigned char temp_byte = 0;
+ int read_status = pReader->Read(pos, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0) // No data to read.
+ return E_BUFFER_NOT_FULL;
+
+ if (temp_byte == 0) // ID length > 8 bytes; invalid file.
+ return E_FILE_FORMAT_INVALID;
+
+ int bit_pos = 0;
+ const int kMaxIdLengthInBytes = 4;
+ const int kCheckByte = 0x80;
+
+ // Find the first bit that's set.
+ bool found_bit = false;
+ for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+ if ((kCheckByte >> bit_pos) & temp_byte) {
+ found_bit = true;
+ break;
+ }
+ }
+
+ if (!found_bit) {
+ // The value is too large to be a valid ID.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Read the remaining bytes of the ID (if any).
+ const int id_length = bit_pos + 1;
+ long long ebml_id = temp_byte;
+ for (int i = 1; i < id_length; ++i) {
+ ebml_id <<= 8;
+ read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0)
+ return E_BUFFER_NOT_FULL;
+
+ ebml_id |= temp_byte;
+ }
+
+ len = id_length;
+ return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
long long total, available;
int status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
+ if (status < 0 || (total >= 0 && available > total))
+ return E_FILE_FORMAT_INVALID;
len = 1;
@@ -112,11 +174,9 @@
status = pReader->Read(pos, 1, &b);
- if (status < 0)
+ if (status != 0)
return status;
- assert(status == 0);
-
if (b == 0) // we can't handle u-int values larger than 8 bytes
return E_FILE_FORMAT_INVALID;
@@ -132,12 +192,8 @@
// TODO(vigneshv): This function assumes that unsigned values never have their
// high bit set.
-long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
- long long size) {
- assert(pReader);
- assert(pos >= 0);
-
- if ((size <= 0) || (size > 8))
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+ if (!pReader || pos < 0 || (size <= 0) || (size > 8))
return E_FILE_FORMAT_INVALID;
long long result = 0;
@@ -159,12 +215,9 @@
return result;
}
-long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
- long long size_, double& result) {
- assert(pReader);
- assert(pos >= 0);
-
- if ((size_ != 4) && (size_ != 8))
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+ double& result) {
+ if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
return E_FILE_FORMAT_INVALID;
const long size = static_cast<long>(size_);
@@ -195,8 +248,6 @@
result = f;
} else {
- assert(size == 8);
-
union {
double d;
unsigned long long dd;
@@ -216,28 +267,25 @@
result = d;
}
+ if (std::isinf(result) || std::isnan(result))
+ return E_FILE_FORMAT_INVALID;
+
return 0;
}
-long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos,
- long long size, long long& result) {
- assert(pReader);
- assert(pos >= 0);
- assert(size > 0);
- assert(size <= 8);
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+ long long& result_ref) {
+ if (!pReader || pos < 0 || size < 1 || size > 8)
+ return E_FILE_FORMAT_INVALID;
- {
- signed char b;
+ signed char first_byte = 0;
+ const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
- const long status = pReader->Read(pos, 1, (unsigned char*)&b);
+ if (status < 0)
+ return status;
- if (status < 0)
- return status;
-
- result = b;
-
- ++pos;
- }
+ unsigned long long result = first_byte;
+ ++pos;
for (long i = 1; i < size; ++i) {
unsigned char b;
@@ -253,23 +301,24 @@
++pos;
}
- return 0; // success
+ result_ref = static_cast<long long>(result);
+ return 0;
}
-long mkvparser::UnserializeString(IMkvReader* pReader, long long pos,
- long long size_, char*& str) {
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+ char*& str) {
delete[] str;
str = NULL;
- if (size_ >= LONG_MAX) // we need (size+1) chars
+ if (size >= LONG_MAX || size < 0)
return E_FILE_FORMAT_INVALID;
- const long size = static_cast<long>(size_);
+ // +1 for '\0' terminator
+ const long required_size = static_cast<long>(size) + 1;
- str = new (std::nothrow) char[size + 1];
-
+ str = SafeArrayAlloc<char>(1, required_size);
if (str == NULL)
- return -1;
+ return E_FILE_FORMAT_INVALID;
unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
@@ -282,137 +331,149 @@
return status;
}
- str[size] = '\0';
-
- return 0; // success
+ str[required_size - 1] = '\0';
+ return 0;
}
-long mkvparser::ParseElementHeader(IMkvReader* pReader, long long& pos,
- long long stop, long long& id,
- long long& size) {
- if ((stop >= 0) && (pos >= stop))
+long ParseElementHeader(IMkvReader* pReader, long long& pos,
+ long long stop, long long& id,
+ long long& size) {
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
long len;
- id = ReadUInt(pReader, pos, len);
+ id = ReadID(pReader, pos, len);
if (id < 0)
return E_FILE_FORMAT_INVALID;
pos += len; // consume id
- if ((stop >= 0) && (pos >= stop))
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
size = ReadUInt(pReader, pos, len);
- if (size < 0)
+ if (size < 0 || len < 1 || len > 8) {
+ // Invalid: Negative payload size, negative or 0 length integer, or integer
+ // larger than 64 bits (libwebm cannot handle them).
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ const unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LONG_LONG_MAX)
return E_FILE_FORMAT_INVALID;
pos += len; // consume length of size
// pos now designates payload
- if ((stop >= 0) && ((pos + size) > stop))
+ if (stop >= 0 && pos >= stop)
return E_FILE_FORMAT_INVALID;
return 0; // success
}
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
- long long& val) {
- assert(pReader);
- assert(pos >= 0);
-
- long long total, available;
-
- const long status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
- if (status < 0)
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ long long& val) {
+ if (!pReader || pos < 0)
return false;
- long len;
+ long long total = 0;
+ long long available = 0;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ const long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
- if ((unsigned long)id != id_)
+ long len = 0;
+
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
return false;
pos += len; // consume id
const long long size = ReadUInt(pReader, pos, len);
- assert(size >= 0);
- assert(size <= 8);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+ return false;
pos += len; // consume length of size of payload
val = UnserializeUInt(pReader, pos, size);
- assert(val >= 0);
+ if (val < 0)
+ return false;
pos += size; // consume size of payload
return true;
}
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
- unsigned char*& buf, size_t& buflen) {
- assert(pReader);
- assert(pos >= 0);
-
- long long total, available;
-
- long status = pReader->Length(&total, &available);
- assert(status >= 0);
- assert((total < 0) || (available <= total));
- if (status < 0)
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ unsigned char*& buf, size_t& buflen) {
+ if (!pReader || pos < 0)
return false;
- long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ long long total = 0;
+ long long available = 0;
- if ((unsigned long)id != id_)
+ long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
+
+ long len = 0;
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
return false;
pos += len; // consume id
- const long long size_ = ReadUInt(pReader, pos, len);
- assert(size_ >= 0);
- assert(len > 0);
- assert(len <= 8);
- assert((pos + len) <= available);
+ const long long size = ReadUInt(pReader, pos, len);
+ if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+ return false;
+
+ unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LONG_LONG_MAX)
+ return false;
pos += len; // consume length of size of payload
- assert((pos + size_) <= available);
- const long buflen_ = static_cast<long>(size_);
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LONG_LONG_MAX)
+ return false;
- buf = new (std::nothrow) unsigned char[buflen_];
- assert(buf); // TODO
+ if ((pos + size) > available)
+ return false;
+
+ if (size >= LONG_MAX)
+ return false;
+
+ const long buflen_ = static_cast<long>(size);
+
+ buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+ if (!buf)
+ return false;
status = pReader->Read(pos, buflen_, buf);
- assert(status == 0); // TODO
+ if (status != 0)
+ return false;
buflen = buflen_;
- pos += size_; // consume size of payload
+ pos += size; // consume size of payload
return true;
}
-namespace mkvparser {
-
EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
EBMLHeader::~EBMLHeader() { delete[] m_docType; }
@@ -433,7 +494,8 @@
}
long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
- assert(pReader);
+ if (!pReader)
+ return E_FILE_FORMAT_INVALID;
long long total, available;
@@ -445,67 +507,45 @@
pos = 0;
long long end = (available >= 1024) ? 1024 : available;
- for (;;) {
- unsigned char b = 0;
+ // Scan until we find what looks like the first byte of the EBML header.
+ const int kMaxScanBytes = (available >= 1024) ? 1024 : available;
+ const unsigned char kEbmlByte0 = 0x1A;
+ unsigned char scan_byte = 0;
- while (pos < end) {
- status = pReader->Read(pos, 1, &b);
+ while (pos < kMaxScanBytes) {
+ status = pReader->Read(pos, 1, &scan_byte);
- if (status < 0) // error
- return status;
+ if (status < 0) // error
+ return status;
+ else if (status > 0)
+ return E_BUFFER_NOT_FULL;
- if (b == 0x1A)
- break;
-
- ++pos;
- }
-
- if (b != 0x1A) {
- if (pos >= 1024)
- return E_FILE_FORMAT_INVALID; // don't bother looking anymore
-
- if ((total >= 0) && ((total - available) < 5))
- return E_FILE_FORMAT_INVALID;
-
- return available + 5; // 5 = 4-byte ID + 1st byte of size
- }
-
- if ((total >= 0) && ((total - pos) < 5))
- return E_FILE_FORMAT_INVALID;
-
- if ((available - pos) < 5)
- return pos + 5; // try again later
-
- long len;
-
- const long long result = ReadUInt(pReader, pos, len);
-
- if (result < 0) // error
- return result;
-
- if (result == 0x0A45DFA3) { // EBML Header ID
- pos += len; // consume ID
+ if (scan_byte == kEbmlByte0)
break;
- }
- ++pos; // throw away just the 0x1A byte, and try again
+ ++pos;
}
- // pos designates start of size field
+ long len = 0;
+ const long long ebml_id = ReadID(pReader, pos, len);
- // get length of size field
+ // TODO(tomfinegan): Move Matroska ID constants into a common namespace.
+ if (len != 4 || ebml_id != mkvmuxer::kMkvEBML)
+ return E_FILE_FORMAT_INVALID;
- long len;
+ // Move read pos forward to the EBML header size field.
+ pos += 4;
+
+ // Read length of size field.
long long result = GetUIntLength(pReader, pos, len);
if (result < 0) // error
- return result;
+ return E_FILE_FORMAT_INVALID;
+ else if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
- if (result > 0) // need more data
- return result;
-
- assert(len > 0);
- assert(len <= 8);
+ if (len < 1 || len > 8)
+ return E_FILE_FORMAT_INVALID;
if ((total >= 0) && ((total - pos) < len))
return E_FILE_FORMAT_INVALID;
@@ -513,8 +553,7 @@
if ((available - pos) < len)
return pos + len; // try again later
- // get the EBML header size
-
+ // Read the EBML header size.
result = ReadUInt(pReader, pos, len);
if (result < 0) // error
@@ -542,30 +581,30 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0)
return E_FILE_FORMAT_INVALID;
- if (id == 0x0286) { // version
+ if (id == mkvmuxer::kMkvEBMLVersion) {
m_version = UnserializeUInt(pReader, pos, size);
if (m_version <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F7) { // read version
+ } else if (id == mkvmuxer::kMkvEBMLReadVersion) {
m_readVersion = UnserializeUInt(pReader, pos, size);
if (m_readVersion <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F2) { // max id length
+ } else if (id == mkvmuxer::kMkvEBMLMaxIDLength) {
m_maxIdLength = UnserializeUInt(pReader, pos, size);
if (m_maxIdLength <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x02F3) { // max size length
+ } else if (id == mkvmuxer::kMkvEBMLMaxSizeLength) {
m_maxSizeLength = UnserializeUInt(pReader, pos, size);
if (m_maxSizeLength <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0282) { // doctype
+ } else if (id == mkvmuxer::kMkvDocType) {
if (m_docType)
return E_FILE_FORMAT_INVALID;
@@ -573,12 +612,12 @@
if (status) // error
return status;
- } else if (id == 0x0287) { // doctype version
+ } else if (id == mkvmuxer::kMkvDocTypeVersion) {
m_docTypeVersion = UnserializeUInt(pReader, pos, size);
if (m_docTypeVersion <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0285) { // doctype read version
+ } else if (id == mkvmuxer::kMkvDocTypeReadVersion) {
m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
if (m_docTypeReadVersion <= 0)
@@ -588,7 +627,18 @@
pos += size;
}
- assert(pos == end);
+ if (pos != end)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+ if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+ if (m_maxIdLength <= 0 || m_maxIdLength > 4 ||
+ m_maxSizeLength <= 0 || m_maxSizeLength > 8)
+ return E_FILE_FORMAT_INVALID;
+
return 0;
}
@@ -621,8 +671,6 @@
while (i != j) {
Cluster* const p = *i++;
- assert(p);
-
delete p;
}
@@ -638,8 +686,8 @@
long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
Segment*& pSegment) {
- assert(pReader);
- assert(pos >= 0);
+ if (pReader == NULL || pos < 0)
+ return E_PARSE_FAILED;
pSegment = NULL;
@@ -691,10 +739,10 @@
return pos + len;
const long long idpos = pos;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
- if (id < 0) // error
- return id;
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -723,7 +771,7 @@
// Handle "unknown size" for live streaming of webm files.
const long long unknown_size = (1LL << (7 * len)) - 1;
- if (id == 0x08538067) { // Segment ID
+ if (id == mkvmuxer::kMkvSegment) {
if (size == unknown_size)
size = -1;
@@ -733,12 +781,9 @@
else if ((pos + size) > total)
size = -1;
- pSegment = new (std::nothrow) Segment(pReader, idpos,
- // elem_size
- pos, size);
-
- if (pSegment == 0)
- return -1; // generic error
+ pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+ if (pSegment == NULL)
+ return E_PARSE_FAILED;
return 0; // success
}
@@ -767,11 +812,15 @@
if (status < 0) // error
return status;
- assert((total < 0) || (available <= total));
+ if (total > 0 && available > total)
+ return E_FILE_FORMAT_INVALID;
const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
- assert((segment_stop < 0) || (total < 0) || (segment_stop <= total));
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+ if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+ (segment_stop >= 0 && m_pos > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
for (;;) {
if ((total >= 0) && (m_pos >= total))
@@ -783,6 +832,11 @@
long long pos = m_pos;
const long long element_start = pos;
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ unsigned long long rollover_check = pos + 1ULL;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
if ((pos + 1) > available)
return (pos + 1);
@@ -792,8 +846,10 @@
if (result < 0) // error
return result;
- if (result > 0) // underflow (weird)
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
return (pos + 1);
+ }
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
return E_FILE_FORMAT_INVALID;
@@ -802,12 +858,12 @@
return pos + len;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id < 0) // error
- return id;
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
break;
pos += len; // consume ID
@@ -821,8 +877,10 @@
if (result < 0) // error
return result;
- if (result > 0) // underflow (weird)
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
return (pos + 1);
+ }
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
return E_FILE_FORMAT_INVALID;
@@ -832,11 +890,19 @@
const long long size = ReadUInt(m_pReader, pos, len);
- if (size < 0) // error
+ if (size < 0 || len < 1 || len > 8) {
+ // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+ // len > 8 is true instead of checking this _everywhere_.
return size;
+ }
pos += len; // consume length of size of element
+ // Avoid rolling over pos when very close to LONG_LONG_MAX.
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
const long long element_size = size + pos - element_start;
// Pos now points to start of payload
@@ -849,7 +915,7 @@
if ((pos + size) > available)
return pos + size;
- if (id == 0x0549A966) { // Segment Info ID
+ if (id == mkvmuxer::kMkvInfo) {
if (m_pInfo)
return E_FILE_FORMAT_INVALID;
@@ -863,7 +929,7 @@
if (status)
return status;
- } else if (id == 0x0654AE6B) { // Tracks ID
+ } else if (id == mkvmuxer::kMkvTracks) {
if (m_pTracks)
return E_FILE_FORMAT_INVALID;
@@ -877,7 +943,7 @@
if (status)
return status;
- } else if (id == 0x0C53BB6B) { // Cues ID
+ } else if (id == mkvmuxer::kMkvCues) {
if (m_pCues == NULL) {
m_pCues = new (std::nothrow)
Cues(this, pos, size, element_start, element_size);
@@ -885,7 +951,7 @@
if (m_pCues == NULL)
return -1;
}
- } else if (id == 0x014D9B74) { // SeekHead ID
+ } else if (id == mkvmuxer::kMkvSeekHead) {
if (m_pSeekHead == NULL) {
m_pSeekHead = new (std::nothrow)
SeekHead(this, pos, size, element_start, element_size);
@@ -898,7 +964,7 @@
if (status)
return status;
}
- } else if (id == 0x0043A770) { // Chapters ID
+ } else if (id == mkvmuxer::kMkvChapters) {
if (m_pChapters == NULL) {
m_pChapters = new (std::nothrow)
Chapters(this, pos, size, element_start, element_size);
@@ -911,7 +977,7 @@
if (status)
return status;
}
- } else if (id == 0x0254C367) { // Tags ID
+ } else if (id == mkvmuxer::kMkvTags) {
if (m_pTags == NULL) {
m_pTags = new (std::nothrow)
Tags(this, pos, size, element_start, element_size);
@@ -929,7 +995,8 @@
m_pos = pos + size; // consume payload
}
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
if (m_pInfo == NULL) // TODO: liberalize this behavior
return E_FILE_FORMAT_INVALID;
@@ -960,7 +1027,8 @@
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
@@ -988,7 +1056,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -998,10 +1066,10 @@
return E_BUFFER_NOT_FULL;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id < 0) // error (or underflow)
- return static_cast<long>(id);
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -1017,7 +1085,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -1035,7 +1103,8 @@
// pos now points to start of payload
- if (size == 0) { // weird
+ if (size == 0) {
+ // Missing element payload: move on.
m_pos = pos;
continue;
}
@@ -1047,24 +1116,30 @@
return E_FILE_FORMAT_INVALID;
}
- if (id == 0x0C53BB6B) { // Cues ID
- if (size == unknown_size)
- return E_FILE_FORMAT_INVALID; // TODO: liberalize
+ if (id == mkvmuxer::kMkvCues) {
+ if (size == unknown_size) {
+ // Cues element of unknown size: Not supported.
+ return E_FILE_FORMAT_INVALID;
+ }
if (m_pCues == NULL) {
const long long element_size = (pos - idpos) + size;
- m_pCues = new Cues(this, pos, size, idpos, element_size);
- assert(m_pCues); // TODO
+ m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+ if (m_pCues == NULL)
+ return -1;
}
m_pos = pos + size; // consume payload
continue;
}
- if (id != 0x0F43B675) { // Cluster ID
+ if (id != mkvmuxer::kMkvCluster) {
+ // Besides the Segment, Libwebm allows only cluster elements of unknown
+ // size. Fail the parse upon encountering a non-cluster element reporting
+ // unknown size.
if (size == unknown_size)
- return E_FILE_FORMAT_INVALID; // TODO: liberalize
+ return E_FILE_FORMAT_INVALID;
m_pos = pos + size; // consume payload
continue;
@@ -1080,7 +1155,10 @@
break;
}
- assert(cluster_off >= 0); // have cluster
+ if (cluster_off < 0) {
+ // No cluster, die.
+ return E_FILE_FORMAT_INVALID;
+ }
long long pos_;
long len_;
@@ -1126,14 +1204,16 @@
const long idx = m_clusterCount;
if (m_clusterPreloadCount > 0) {
- assert(idx < m_clusterSize);
+ if (idx >= m_clusterSize)
+ return E_FILE_FORMAT_INVALID;
Cluster* const pCluster = m_clusters[idx];
- assert(pCluster);
- assert(pCluster->m_index < 0);
+ if (pCluster == NULL || pCluster->m_index >= 0)
+ return E_FILE_FORMAT_INVALID;
const long long off = pCluster->GetPosition();
- assert(off >= 0);
+ if (off < 0)
+ return E_FILE_FORMAT_INVALID;
if (off == cluster_off) { // preloaded already
if (status == 0) // no entries found
@@ -1155,7 +1235,8 @@
--m_clusterPreloadCount;
m_pos = pos; // consume payload
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 0; // success
}
@@ -1182,19 +1263,21 @@
// status > 0 means we have an entry
Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
- // element_size);
- assert(pCluster);
+ if (pCluster == NULL)
+ return -1;
- AppendCluster(pCluster);
- assert(m_clusters);
- assert(idx < m_clusterSize);
- assert(m_clusters[idx] == pCluster);
+ if (!AppendCluster(pCluster)) {
+ delete pCluster;
+ return -1;
+ }
if (cluster_size >= 0) {
pos += cluster_size;
m_pos = pos;
- assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+ if (segment_stop > 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -1210,8 +1293,8 @@
}
long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
- assert(m_pos < 0);
- assert(m_pUnknownSize);
+ if (m_pos >= 0 || m_pUnknownSize == NULL)
+ return E_PARSE_FAILED;
const long status = m_pUnknownSize->Parse(pos, len);
@@ -1221,12 +1304,11 @@
if (status == 0) // parsed a block
return 2; // continue parsing
- assert(status > 0); // nothing left to parse of this cluster
-
const long long start = m_pUnknownSize->m_element_start;
-
const long long size = m_pUnknownSize->GetElementSize();
- assert(size >= 0);
+
+ if (size < 0)
+ return E_FILE_FORMAT_INVALID;
pos = start + size;
m_pos = pos;
@@ -1236,24 +1318,26 @@
return 2; // continue parsing
}
-void Segment::AppendCluster(Cluster* pCluster) {
- assert(pCluster);
- assert(pCluster->m_index >= 0);
+bool Segment::AppendCluster(Cluster* pCluster) {
+ if (pCluster == NULL || pCluster->m_index < 0)
+ return false;
const long count = m_clusterCount + m_clusterPreloadCount;
long& size = m_clusterSize;
- assert(size >= count);
-
const long idx = pCluster->m_index;
- assert(idx == m_clusterCount);
+
+ if (size < count || idx != m_clusterCount)
+ return false;
if (count >= size) {
const long n = (size <= 0) ? 2048 : 2 * size;
- Cluster** const qq = new Cluster*[n];
- Cluster** q = qq;
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
+ Cluster** q = qq;
Cluster** p = m_clusters;
Cluster** const pp = p + count;
@@ -1267,18 +1351,18 @@
}
if (m_clusterPreloadCount > 0) {
- assert(m_clusters);
-
Cluster** const p = m_clusters + m_clusterCount;
- assert(*p);
- assert((*p)->m_index < 0);
+ if (*p == NULL || (*p)->m_index >= 0)
+ return false;
Cluster** q = p + m_clusterPreloadCount;
- assert(q < (m_clusters + size));
+ if (q >= (m_clusters + size))
+ return false;
for (;;) {
Cluster** const qq = q - 1;
- assert((*qq)->m_index < 0);
+ if ((*qq)->m_index >= 0)
+ return false;
*q = *qq;
q = qq;
@@ -1290,22 +1374,25 @@
m_clusters[idx] = pCluster;
++m_clusterCount;
+ return true;
}
-void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
- assert(pCluster);
- assert(pCluster->m_index < 0);
- assert(idx >= m_clusterCount);
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+ if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+ return false;
const long count = m_clusterCount + m_clusterPreloadCount;
long& size = m_clusterSize;
- assert(size >= count);
+ if (size < count)
+ return false;
if (count >= size) {
const long n = (size <= 0) ? 2048 : 2 * size;
- Cluster** const qq = new Cluster*[n];
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
Cluster** q = qq;
Cluster** p = m_clusters;
@@ -1320,17 +1407,20 @@
size = n;
}
- assert(m_clusters);
+ if (m_clusters == NULL)
+ return false;
Cluster** const p = m_clusters + idx;
Cluster** q = m_clusters + count;
- assert(q >= p);
- assert(q < (m_clusters + size));
+ if (q < p || q >= (m_clusters + size))
+ return false;
while (q > p) {
Cluster** const qq = q - 1;
- assert((*qq)->m_index < 0);
+
+ if ((*qq)->m_index >= 0)
+ return false;
*q = *qq;
q = qq;
@@ -1338,13 +1428,12 @@
m_clusters[idx] = pCluster;
++m_clusterPreloadCount;
+ return true;
}
long Segment::Load() {
- assert(m_clusters == NULL);
- assert(m_clusterSize == 0);
- assert(m_clusterCount == 0);
- // assert(m_size >= 0);
+ if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0)
+ return E_PARSE_FAILED;
// Outermost (level 0) segment object has been constructed,
// and pos designates start of payload. We need to find the
@@ -1358,8 +1447,8 @@
if (header_status > 0) // underflow
return E_BUFFER_NOT_FULL;
- assert(m_pInfo);
- assert(m_pTracks);
+ if (m_pInfo == NULL || m_pTracks == NULL)
+ return E_FILE_FORMAT_INVALID;
for (;;) {
const int status = LoadCluster();
@@ -1408,16 +1497,19 @@
if (status < 0) // error
return status;
- if (id == 0x0DBB) // SeekEntry ID
+ if (id == mkvmuxer::kMkvSeek)
++entry_count;
- else if (id == 0x6C) // Void ID
+ else if (id == mkvmuxer::kMkvVoid)
++void_element_count;
pos += size; // consume payload
- assert(pos <= stop);
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
m_entries = new (std::nothrow) Entry[entry_count];
@@ -1446,14 +1538,14 @@
if (status < 0) // error
return status;
- if (id == 0x0DBB) { // SeekEntry ID
+ if (id == mkvmuxer::kMkvSeek) {
if (ParseEntry(pReader, pos, size, pEntry)) {
Entry& e = *pEntry++;
e.element_start = idpos;
e.element_size = (pos + size) - idpos;
}
- } else if (id == 0x6C) { // Void ID
+ } else if (id == mkvmuxer::kMkvVoid) {
VoidElement& e = *pVoidElement++;
e.element_start = idpos;
@@ -1461,10 +1553,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
assert(count_ >= 0);
@@ -1553,9 +1647,9 @@
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
- if (id != 0x0C53BB6B) // Cues ID
+ if (id != mkvmuxer::kMkvCues)
return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -1615,7 +1709,8 @@
m_pCues =
new (std::nothrow) Cues(this, pos, size, element_start, element_size);
- assert(m_pCues); // TODO
+ if (m_pCues == NULL)
+ return -1;
return 0; // success
}
@@ -1632,10 +1727,11 @@
// parse the container for the level-1 element ID
- const long long seekIdId = ReadUInt(pReader, pos, len);
- // seekIdId;
+ const long long seekIdId = ReadID(pReader, pos, len);
+ if (seekIdId < 0)
+ return false;
- if (seekIdId != 0x13AB) // SeekID ID
+ if (seekIdId != mkvmuxer::kMkvSeekID)
return false;
if ((pos + len) > stop)
@@ -1677,9 +1773,9 @@
pos += seekIdSize; // consume SeekID payload
- const long long seekPosId = ReadUInt(pReader, pos, len);
+ const long long seekPosId = ReadID(pReader, pos, len);
- if (seekPosId != 0x13AC) // SeekPos ID
+ if (seekPosId != mkvmuxer::kMkvSeekPosition)
return false;
if ((pos + len) > stop)
@@ -1757,8 +1853,8 @@
if (m_cue_points)
return true;
- assert(m_count == 0);
- assert(m_preload_count == 0);
+ if (m_count != 0 || m_preload_count != 0)
+ return false;
IMkvReader* const pReader = m_pSegment->m_pReader;
@@ -1772,7 +1868,7 @@
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0 || (pos + len) > stop) {
return false;
}
@@ -1789,21 +1885,27 @@
return false;
}
- if (id == 0x3B) // CuePoint ID
- PreloadCuePoint(cue_points_size, idpos);
+ if (id == mkvmuxer::kMkvCuePoint) {
+ if (!PreloadCuePoint(cue_points_size, idpos))
+ return false;
+ }
pos += size; // skip payload
}
return true;
}
-void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
- assert(m_count == 0);
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+ if (m_count != 0)
+ return false;
if (m_preload_count >= cue_points_size) {
const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
- CuePoint** const qq = new CuePoint*[n];
+ CuePoint** const qq = new (std::nothrow) CuePoint*[n];
+ if (qq == NULL)
+ return false;
+
CuePoint** q = qq; // beginning of target
CuePoint** p = m_cue_points; // beginning of source
@@ -1818,14 +1920,15 @@
cue_points_size = n;
}
- CuePoint* const pCP = new CuePoint(m_preload_count, pos);
+ CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos);
+ if (pCP == NULL)
+ return false;
+
m_cue_points[m_preload_count++] = pCP;
+ return true;
}
bool Cues::LoadCuePoint() const {
- // odbgstream os;
- // os << "Cues::LoadCuePoint" << endl;
-
const long long stop = m_start + m_size;
if (m_pos >= stop)
@@ -1843,32 +1946,33 @@
long len;
- const long long id = ReadUInt(pReader, m_pos, len);
- assert(id >= 0); // TODO
- assert((m_pos + len) <= stop);
+ const long long id = ReadID(pReader, m_pos, len);
+ if (id < 0 || (m_pos + len) > stop)
+ return false;
m_pos += len; // consume ID
const long long size = ReadUInt(pReader, m_pos, len);
- assert(size >= 0);
- assert((m_pos + len) <= stop);
+ if (size < 0 || (m_pos + len) > stop)
+ return false;
m_pos += len; // consume Size field
- assert((m_pos + size) <= stop);
+ if ((m_pos + size) > stop)
+ return false;
- if (id != 0x3B) { // CuePoint ID
+ if (id != mkvmuxer::kMkvCuePoint) {
m_pos += size; // consume payload
- assert(m_pos <= stop);
+ if (m_pos > stop)
+ return false;
continue;
}
- assert(m_preload_count > 0);
+ if (m_preload_count < 1)
+ return false;
CuePoint* const pCP = m_cue_points[m_count];
- assert(pCP);
- assert((pCP->GetTimeCode() >= 0) || (-pCP->GetTimeCode() == idpos));
- if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
+ if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos)))
return false;
if (!pCP->Load(pReader)) {
@@ -1879,24 +1983,18 @@
--m_preload_count;
m_pos += size; // consume payload
- assert(m_pos <= stop);
+ if (m_pos > stop)
+ return false;
return true; // yes, we loaded a cue point
}
- // return (m_pos < stop);
return false; // no, we did not load a cue point
}
bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
const CuePoint::TrackPosition*& pTP) const {
- assert(time_ns >= 0);
- assert(pTrack);
-
- if (m_cue_points == NULL)
- return false;
-
- if (m_count == 0)
+ if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
return false;
CuePoint** const ii = m_cue_points;
@@ -1906,7 +2004,8 @@
CuePoint** j = jj;
pCP = *i;
- assert(pCP);
+ if (pCP == NULL)
+ return false;
if (time_ns <= pCP->GetTime(m_pSegment)) {
pTP = pCP->Find(pTrack);
@@ -1920,10 +2019,12 @@
//[j, jj) > time_ns
CuePoint** const k = i + (j - i) / 2;
- assert(k < jj);
+ if (k >= jj)
+ return false;
CuePoint* const pCP = *k;
- assert(pCP);
+ if (pCP == NULL)
+ return false;
const long long t = pCP->GetTime(m_pSegment);
@@ -1932,16 +2033,17 @@
else
j = k;
- assert(i <= j);
+ if (i > j)
+ return false;
}
- assert(i == j);
- assert(i <= jj);
- assert(i > ii);
+ if (i != j || i > jj || i <= ii)
+ return false;
pCP = *--i;
- assert(pCP);
- assert(pCP->GetTime(m_pSegment) <= time_ns);
+
+ if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+ return false;
// TODO: here and elsewhere, it's probably not correct to search
// for the cue point with this time, and then search for a matching
@@ -1956,55 +2058,50 @@
}
const CuePoint* Cues::GetFirst() const {
- if (m_cue_points == NULL)
- return NULL;
-
- if (m_count == 0)
+ if (m_cue_points == NULL || m_count == 0)
return NULL;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
+ if (pp == NULL)
+ return NULL;
CuePoint* const pCP = pp[0];
- assert(pCP);
- assert(pCP->GetTimeCode() >= 0);
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
return pCP;
}
const CuePoint* Cues::GetLast() const {
- if (m_cue_points == NULL)
- return NULL;
-
- if (m_count <= 0)
+ if (m_cue_points == NULL || m_count <= 0)
return NULL;
const long index = m_count - 1;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
+ if (pp == NULL)
+ return NULL;
CuePoint* const pCP = pp[index];
- assert(pCP);
- assert(pCP->GetTimeCode() >= 0);
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
return pCP;
}
const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
- if (pCurr == NULL)
+ if (pCurr == NULL || pCurr->GetTimeCode() < 0 ||
+ m_cue_points == NULL || m_count < 1) {
return NULL;
-
- assert(pCurr->GetTimeCode() >= 0);
- assert(m_cue_points);
- assert(m_count >= 1);
+ }
long index = pCurr->m_index;
- assert(index < m_count);
+ if (index >= m_count)
+ return NULL;
CuePoint* const* const pp = m_cue_points;
- assert(pp);
- assert(pp[index] == pCurr);
+ if (pp == NULL || pp[index] != pCurr)
+ return NULL;
++index;
@@ -2012,18 +2109,16 @@
return NULL;
CuePoint* const pNext = pp[index];
- assert(pNext);
- assert(pNext->GetTimeCode() >= 0);
+
+ if (pNext == NULL || pNext->GetTimeCode() < 0)
+ return NULL;
return pNext;
}
const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
const CuePoint::TrackPosition* pTP) const {
- if (pCP == NULL)
- return NULL;
-
- if (pTP == NULL)
+ if (pCP == NULL || pTP == NULL)
return NULL;
return m_pSegment->GetBlock(*pCP, *pTP);
@@ -2070,11 +2165,15 @@
// assert(Cluster::HasBlockEntries(this, tp.m_pos));
Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1);
- assert(pCluster);
+ if (pCluster == NULL)
+ return NULL;
const ptrdiff_t idx = i - m_clusters;
- PreloadCluster(pCluster, idx);
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
assert(m_clusters);
assert(m_clusterPreloadCount > 0);
assert(m_clusters[idx] == pCluster);
@@ -2125,12 +2224,15 @@
// assert(Cluster::HasBlockEntries(this, tp.m_pos));
Cluster* const pCluster = Cluster::Create(this, -1, requested_pos);
- //-1);
- assert(pCluster);
+ if (pCluster == NULL)
+ return NULL;
const ptrdiff_t idx = i - m_clusters;
- PreloadCluster(pCluster, idx);
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
assert(m_clusters);
assert(m_clusterPreloadCount > 0);
assert(m_clusters[idx] == pCluster);
@@ -2168,9 +2270,8 @@
{
long len;
- const long long id = ReadUInt(pReader, pos_, len);
- assert(id == 0x3B); // CuePoint ID
- if (id != 0x3B)
+ const long long id = ReadID(pReader, pos_, len);
+ if (id != mkvmuxer::kMkvCuePoint)
return false;
pos_ += len; // consume ID
@@ -2193,7 +2294,7 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if ((id < 0) || (pos + len > stop)) {
return false;
}
@@ -2210,10 +2311,10 @@
return false;
}
- if (id == 0x33) // CueTime ID
+ if (id == mkvmuxer::kMkvCueTime)
m_timecode = UnserializeUInt(pReader, pos, size);
- else if (id == 0x37) // CueTrackPosition(s) ID
+ else if (id == mkvmuxer::kMkvCueTrackPositions)
++m_track_positions_count;
pos += size; // consume payload
@@ -2227,7 +2328,9 @@
// << " timecode=" << m_timecode
// << endl;
- m_track_positions = new TrackPosition[m_track_positions_count];
+ m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count];
+ if (m_track_positions == NULL)
+ return false;
// Now parse track positions
@@ -2237,9 +2340,9 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0);
- assert((pos + len) <= stop);
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return false;
pos += len; // consume ID
@@ -2250,7 +2353,7 @@
pos += len; // consume Size field
assert((pos + size) <= stop);
- if (id == 0x37) { // CueTrackPosition(s) ID
+ if (id == mkvmuxer::kMkvCueTrackPositions) {
TrackPosition& tp = *p++;
if (!tp.Parse(pReader, pos, size)) {
return false;
@@ -2258,7 +2361,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return false;
}
assert(size_t(p - m_track_positions) == m_track_positions_count);
@@ -2281,7 +2385,7 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if ((id < 0) || ((pos + len) > stop)) {
return false;
}
@@ -2298,13 +2402,11 @@
return false;
}
- if (id == 0x77) // CueTrack ID
+ if (id == mkvmuxer::kMkvCueTrack)
m_track = UnserializeUInt(pReader, pos, size);
-
- else if (id == 0x71) // CueClusterPos ID
+ else if (id == mkvmuxer::kMkvCueClusterPosition)
m_pos = UnserializeUInt(pReader, pos, size);
-
- else if (id == 0x1378) // CueBlockNumber
+ else if (id == mkvmuxer::kMkvCueBlockNumber)
m_block = UnserializeUInt(pReader, pos, size);
pos += size; // consume payload
@@ -2437,9 +2539,8 @@
if (result != 0)
return NULL;
- const long long id = ReadUInt(m_pReader, pos, len);
- assert(id == 0x0F43B675); // Cluster ID
- if (id != 0x0F43B675)
+ const long long id = ReadID(m_pReader, pos, len);
+ if (id != mkvmuxer::kMkvCluster)
return NULL;
pos += len; // consume ID
@@ -2474,8 +2575,9 @@
const long long idpos = pos; // pos of next (potential) cluster
- const long long id = ReadUInt(m_pReader, idpos, len);
- assert(id > 0); // TODO
+ const long long id = ReadID(m_pReader, idpos, len);
+ if (id < 0)
+ return NULL;
pos += len; // consume ID
@@ -2495,7 +2597,7 @@
if (size == 0) // weird
continue;
- if (id == 0x0F43B675) { // Cluster ID
+ if (id == mkvmuxer::kMkvCluster) {
const long long off_next_ = idpos - m_start;
long long pos_;
@@ -2553,11 +2655,15 @@
assert(i == j);
Cluster* const pNext = Cluster::Create(this, -1, off_next);
- assert(pNext);
+ if (pNext == NULL)
+ return NULL;
const ptrdiff_t idx_next = i - m_clusters; // insertion position
- PreloadCluster(pNext, idx_next);
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return NULL;
+ }
assert(m_clusters);
assert(idx_next < m_clusterSize);
assert(m_clusters[idx_next] == pNext);
@@ -2641,7 +2747,7 @@
const long long id = ReadUInt(m_pReader, pos, len);
- if (id != 0x0F43B675) // weird: not Cluster ID
+ if (id != mkvmuxer::kMkvCluster)
return -1;
pos += len; // consume ID
@@ -2687,7 +2793,8 @@
// Pos now points to start of payload
pos += size; // consume payload (that is, the current cluster)
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
// By consuming the payload, we are assuming that the curr
// cluster isn't interesting. That is, we don't bother checking
@@ -2755,7 +2862,7 @@
const long long idpos = pos; // absolute
const long long idoff = pos - m_start; // relative
- const long long id = ReadUInt(m_pReader, idpos, len); // absolute
+ const long long id = ReadID(m_pReader, idpos, len); // absolute
if (id < 0) // error
return static_cast<long>(id);
@@ -2805,7 +2912,7 @@
return E_FILE_FORMAT_INVALID;
}
- if (id == 0x0C53BB6B) { // Cues ID
+ if (id == mkvmuxer::kMkvCues) {
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
@@ -2818,22 +2925,26 @@
const long long element_size = element_stop - element_start;
if (m_pCues == NULL) {
- m_pCues = new Cues(this, pos, size, element_start, element_size);
- assert(m_pCues); // TODO
+ m_pCues = new (std::nothrow)
+ Cues(this, pos, size, element_start, element_size);
+ if (m_pCues == NULL)
+ return false;
}
pos += size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
continue;
}
- if (id != 0x0F43B675) { // not a Cluster ID
+ if (id != mkvmuxer::kMkvCluster) { // not a Cluster ID
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
pos += size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
continue;
}
@@ -2905,12 +3016,15 @@
Cluster* const pNext = Cluster::Create(this,
-1, // preloaded
off_next);
- // element_size);
- assert(pNext);
+ if (pNext == NULL)
+ return -1;
const ptrdiff_t idx_next = i - m_clusters; // insertion position
- PreloadCluster(pNext, idx_next);
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return -1;
+ }
assert(m_clusters);
assert(idx_next < m_clusterSize);
assert(m_clusters[idx_next] == pNext);
@@ -2953,7 +3067,7 @@
return E_BUFFER_NOT_FULL;
const long long idpos = pos;
- const long long id = ReadUInt(m_pReader, idpos, len);
+ const long long id = ReadID(m_pReader, idpos, len);
if (id < 0) // error (or underflow)
return static_cast<long>(id);
@@ -2962,10 +3076,7 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
- break;
-
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCluster || id == mkvmuxer::kMkvCues)
break;
pos += len; // consume ID (of sub-element)
@@ -3012,7 +3123,8 @@
return E_FILE_FORMAT_INVALID;
pos += size; // consume payload of sub-element
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
} // determine cluster size
cluster_size = pos - payload_pos;
@@ -3022,7 +3134,8 @@
}
pos += cluster_size; // consume payload
- assert((segment_stop < 0) || (pos <= segment_stop));
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
return 2; // try to find a cluster that follows next
}
@@ -3131,7 +3244,7 @@
if (size == 0) // weird
continue;
- if (id == 0x05B9) { // EditionEntry ID
+ if (id == mkvmuxer::kMkvEditionEntry) {
status = ParseEdition(pos, size);
if (status < 0) // error
@@ -3139,10 +3252,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3242,10 +3357,10 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0)
continue;
- if (id == 0x36) { // Atom ID
+ if (id == mkvmuxer::kMkvChapterAtom) {
status = ParseAtom(pReader, pos, size);
if (status < 0) // error
@@ -3253,10 +3368,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3373,20 +3490,20 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0) // 0 length payload, skip.
continue;
- if (id == 0x00) { // Display ID
+ if (id == mkvmuxer::kMkvChapterDisplay) {
status = ParseDisplay(pReader, pos, size);
if (status < 0) // error
return status;
- } else if (id == 0x1654) { // StringUID ID
+ } else if (id == mkvmuxer::kMkvChapterStringUID) {
status = UnserializeString(pReader, pos, size, m_string_uid);
if (status < 0) // error
return status;
- } else if (id == 0x33C4) { // UID ID
+ } else if (id == mkvmuxer::kMkvChapterUID) {
long long val;
status = UnserializeInt(pReader, pos, size, val);
@@ -3394,14 +3511,14 @@
return status;
m_uid = static_cast<unsigned long long>(val);
- } else if (id == 0x11) { // TimeStart ID
+ } else if (id == mkvmuxer::kMkvChapterTimeStart) {
const long long val = UnserializeUInt(pReader, pos, size);
if (val < 0) // error
return static_cast<long>(val);
m_start_timecode = val;
- } else if (id == 0x12) { // TimeEnd ID
+ } else if (id == mkvmuxer::kMkvChapterTimeEnd) {
const long long val = UnserializeUInt(pReader, pos, size);
if (val < 0) // error
@@ -3411,10 +3528,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3524,20 +3643,20 @@
if (status < 0) // error
return status;
- if (size == 0) // weird
+ if (size == 0) // No payload.
continue;
- if (id == 0x05) { // ChapterString ID
+ if (id == mkvmuxer::kMkvChapString) {
status = UnserializeString(pReader, pos, size, m_string);
if (status)
return status;
- } else if (id == 0x037C) { // ChapterLanguage ID
+ } else if (id == mkvmuxer::kMkvChapLanguage) {
status = UnserializeString(pReader, pos, size, m_language);
if (status)
return status;
- } else if (id == 0x037E) { // ChapterCountry ID
+ } else if (id == mkvmuxer::kMkvChapCountry) {
status = UnserializeString(pReader, pos, size, m_country);
if (status)
@@ -3545,10 +3664,12 @@
}
pos += size;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3588,7 +3709,7 @@
if (size == 0) // 0 length tag, read another
continue;
- if (id == 0x3373) { // Tag ID
+ if (id == mkvmuxer::kMkvTag) {
status = ParseTag(pos, size);
if (status < 0)
@@ -3596,14 +3717,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3706,7 +3825,7 @@
if (size == 0) // 0 length tag, read another
continue;
- if (id == 0x27C8) { // SimpleTag ID
+ if (id == mkvmuxer::kMkvSimpleTag) {
status = ParseSimpleTag(pReader, pos, size);
if (status < 0)
@@ -3714,14 +3833,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3799,12 +3916,12 @@
if (size == 0) // weird
continue;
- if (id == 0x5A3) { // TagName ID
+ if (id == mkvmuxer::kMkvTagName) {
status = UnserializeString(pReader, pos, size, m_tag_name);
if (status)
return status;
- } else if (id == 0x487) { // TagString ID
+ } else if (id == mkvmuxer::kMkvTagString) {
status = UnserializeString(pReader, pos, size, m_tag_string);
if (status)
@@ -3812,14 +3929,12 @@
}
pos += size;
- assert(pos <= stop);
if (pos > stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
if (pos != stop)
- return -1;
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -3866,12 +3981,12 @@
if (status < 0) // error
return status;
- if (id == 0x0AD7B1) { // Timecode Scale
+ if (id == mkvmuxer::kMkvTimecodeScale) {
m_timecodeScale = UnserializeUInt(pReader, pos, size);
if (m_timecodeScale <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0489) { // Segment duration
+ } else if (id == mkvmuxer::kMkvDuration) {
const long status = UnserializeFloat(pReader, pos, size, m_duration);
if (status < 0)
@@ -3879,19 +3994,19 @@
if (m_duration < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0D80) { // MuxingApp
+ } else if (id == mkvmuxer::kMkvMuxingApp) {
const long status =
UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
if (status)
return status;
- } else if (id == 0x1741) { // WritingApp
+ } else if (id == mkvmuxer::kMkvWritingApp) {
const long status =
UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
if (status)
return status;
- } else if (id == 0x3BA9) { // Title
+ } else if (id == mkvmuxer::kMkvTitle) {
const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
if (status)
@@ -3899,10 +4014,17 @@
}
pos += size;
- assert(pos <= stop);
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ const double rollover_check = m_duration * m_timecodeScale;
+ if (rollover_check > LONG_LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4039,15 +4161,15 @@
if (status < 0) // error
return status;
- if (id == 0x7E8) {
- // AESSettingsCipherMode
+ if (id == mkvmuxer::kMkvAESSettingsCipherMode) {
aes->cipher_mode = UnserializeUInt(pReader, pos, size);
if (aes->cipher_mode != 1)
return E_FILE_FORMAT_INVALID;
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
return 0;
@@ -4070,14 +4192,15 @@
if (status < 0) // error
return status;
- if (id == 0x1034) // ContentCompression ID
+ if (id == mkvmuxer::kMkvContentCompression)
++compression_count;
- if (id == 0x1035) // ContentEncryption ID
+ if (id == mkvmuxer::kMkvContentEncryption)
++encryption_count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (compression_count <= 0 && encryption_count <= 0)
@@ -4108,19 +4231,15 @@
if (status < 0) // error
return status;
- if (id == 0x1031) {
- // ContentEncodingOrder
+ if (id == mkvmuxer::kMkvContentEncodingOrder) {
encoding_order_ = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x1032) {
- // ContentEncodingScope
+ } else if (id == mkvmuxer::kMkvContentEncodingScope) {
encoding_scope_ = UnserializeUInt(pReader, pos, size);
if (encoding_scope_ < 1)
return -1;
- } else if (id == 0x1033) {
- // ContentEncodingType
+ } else if (id == mkvmuxer::kMkvContentEncodingType) {
encoding_type_ = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x1034) {
- // ContentCompression ID
+ } else if (id == mkvmuxer::kMkvContentCompression) {
ContentCompression* const compression =
new (std::nothrow) ContentCompression();
if (!compression)
@@ -4132,8 +4251,7 @@
return status;
}
*compression_entries_end_++ = compression;
- } else if (id == 0x1035) {
- // ContentEncryption ID
+ } else if (id == mkvmuxer::kMkvContentEncryption) {
ContentEncryption* const encryption =
new (std::nothrow) ContentEncryption();
if (!encryption)
@@ -4148,10 +4266,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4172,21 +4292,18 @@
if (status < 0) // error
return status;
- if (id == 0x254) {
- // ContentCompAlgo
+ if (id == mkvmuxer::kMkvContentCompAlgo) {
long long algo = UnserializeUInt(pReader, pos, size);
if (algo < 0)
return E_FILE_FORMAT_INVALID;
compression->algo = algo;
valid = true;
- } else if (id == 0x255) {
- // ContentCompSettings
+ } else if (id == mkvmuxer::kMkvContentCompSettings) {
if (size <= 0)
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4202,7 +4319,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
// ContentCompAlgo is mandatory
@@ -4227,13 +4345,11 @@
if (status < 0) // error
return status;
- if (id == 0x7E1) {
- // ContentEncAlgo
+ if (id == mkvmuxer::kMkvContentEncAlgo) {
encryption->algo = UnserializeUInt(pReader, pos, size);
if (encryption->algo != 5)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x7E2) {
- // ContentEncKeyID
+ } else if (id == mkvmuxer::kMkvContentEncKeyID) {
delete[] encryption->key_id;
encryption->key_id = NULL;
encryption->key_id_len = 0;
@@ -4242,8 +4358,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4256,8 +4371,7 @@
encryption->key_id = buf;
encryption->key_id_len = buflen;
- } else if (id == 0x7E3) {
- // ContentSignature
+ } else if (id == mkvmuxer::kMkvContentSignature) {
delete[] encryption->signature;
encryption->signature = NULL;
encryption->signature_len = 0;
@@ -4266,8 +4380,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4280,8 +4393,7 @@
encryption->signature = buf;
encryption->signature_len = buflen;
- } else if (id == 0x7E4) {
- // ContentSigKeyID
+ } else if (id == mkvmuxer::kMkvContentSigKeyID) {
delete[] encryption->sig_key_id;
encryption->sig_key_id = NULL;
encryption->sig_key_id_len = 0;
@@ -4290,8 +4402,7 @@
return E_FILE_FORMAT_INVALID;
const size_t buflen = static_cast<size_t>(size);
- typedef unsigned char* buf_t;
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -4304,14 +4415,11 @@
encryption->sig_key_id = buf;
encryption->sig_key_id_len = buflen;
- } else if (id == 0x7E5) {
- // ContentSigAlgo
+ } else if (id == mkvmuxer::kMkvContentSigAlgo) {
encryption->sig_algo = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x7E6) {
- // ContentSigHashAlgo
+ } else if (id == mkvmuxer::kMkvContentSigHashAlgo) {
encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x7E7) {
- // ContentEncAESSettings
+ } else if (id == mkvmuxer::kMkvContentEncAESSettings) {
const long status = ParseContentEncAESSettingsEntry(
pos, size, pReader, &encryption->aes_settings);
if (status)
@@ -4319,7 +4427,8 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
return 0;
@@ -4418,7 +4527,7 @@
const size_t len = strlen(src);
- dst = new (std::nothrow) char[len + 1];
+ dst = SafeArrayAlloc<char>(1, len + 1);
if (dst == NULL)
return -1;
@@ -4469,7 +4578,7 @@
if (dst.codecPrivateSize != 0)
return -1;
- dst.codecPrivate = new (std::nothrow) unsigned char[codecPrivateSize];
+ dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
if (dst.codecPrivate == NULL)
return -1;
@@ -4797,11 +4906,12 @@
return status;
// pos now designates start of element
- if (id == 0x2240) // ContentEncoding ID
+ if (id == mkvmuxer::kMkvContentEncoding)
++count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (count <= 0)
@@ -4821,7 +4931,7 @@
return status;
// pos now designates start of element
- if (id == 0x2240) { // ContentEncoding ID
+ if (id == mkvmuxer::kMkvContentEncoding) {
ContentEncoding* const content_encoding =
new (std::nothrow) ContentEncoding();
if (!content_encoding)
@@ -4837,10 +4947,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0;
}
@@ -4892,37 +5004,37 @@
if (status < 0) // error
return status;
- if (id == 0x30) { // pixel width
+ if (id == mkvmuxer::kMkvPixelWidth) {
width = UnserializeUInt(pReader, pos, size);
if (width <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x3A) { // pixel height
+ } else if (id == mkvmuxer::kMkvPixelHeight) {
height = UnserializeUInt(pReader, pos, size);
if (height <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14B0) { // display width
+ } else if (id == mkvmuxer::kMkvDisplayWidth) {
display_width = UnserializeUInt(pReader, pos, size);
if (display_width <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14BA) { // display height
+ } else if (id == mkvmuxer::kMkvDisplayHeight) {
display_height = UnserializeUInt(pReader, pos, size);
if (display_height <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x14B2) { // display unit
+ } else if (id == mkvmuxer::kMkvDisplayUnit) {
display_unit = UnserializeUInt(pReader, pos, size);
if (display_unit < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x13B8) { // stereo mode
+ } else if (id == mkvmuxer::kMkvStereoMode) {
stereo_mode = UnserializeUInt(pReader, pos, size);
if (stereo_mode < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x0383E3) { // frame rate
+ } else if (id == mkvmuxer::kMkvFrameRate) {
const long status = UnserializeFloat(pReader, pos, size, rate);
if (status < 0)
@@ -4933,10 +5045,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
VideoTrack* const pTrack =
new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
@@ -5110,7 +5224,7 @@
if (status < 0) // error
return status;
- if (id == 0x35) { // Sample Rate
+ if (id == mkvmuxer::kMkvSamplingFrequency) {
status = UnserializeFloat(pReader, pos, size, rate);
if (status < 0)
@@ -5118,12 +5232,12 @@
if (rate <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x1F) { // Channel Count
+ } else if (id == mkvmuxer::kMkvChannels) {
channels = UnserializeUInt(pReader, pos, size);
if (channels <= 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x2264) { // Bit Depth
+ } else if (id == mkvmuxer::kMkvBitDepth) {
bit_depth = UnserializeUInt(pReader, pos, size);
if (bit_depth <= 0)
@@ -5131,10 +5245,12 @@
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
AudioTrack* const pTrack =
new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
@@ -5194,14 +5310,16 @@
if (size == 0) // weird
continue;
- if (id == 0x2E) // TrackEntry ID
+ if (id == mkvmuxer::kMkvTrackEntry)
++count;
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
if (count <= 0)
return 0; // success
@@ -5234,13 +5352,12 @@
const long long element_size = payload_stop - element_start;
- if (id == 0x2E) { // TrackEntry ID
+ if (id == mkvmuxer::kMkvTrackEntry) {
Track*& pTrack = *m_trackEntriesEnd;
pTrack = NULL;
const long status = ParseTrackEntry(pos, payload_size, element_start,
element_size, pTrack);
-
if (status)
return status;
@@ -5249,10 +5366,12 @@
}
pos = payload_stop;
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
return 0; // success
}
@@ -5309,16 +5428,16 @@
const long long start = pos;
- if (id == 0x60) { // VideoSettings ID
+ if (id == mkvmuxer::kMkvVideo) {
v.start = start;
v.size = size;
- } else if (id == 0x61) { // AudioSettings ID
+ } else if (id == mkvmuxer::kMkvAudio) {
a.start = start;
a.size = size;
- } else if (id == 0x2D80) { // ContentEncodings ID
+ } else if (id == mkvmuxer::kMkvContentEncodings) {
e.start = start;
e.size = size;
- } else if (id == 0x33C5) { // Track UID
+ } else if (id == mkvmuxer::kMkvTrackUID) {
if (size > 8)
return E_FILE_FORMAT_INVALID;
@@ -5340,49 +5459,49 @@
++pos_;
}
- } else if (id == 0x57) { // Track Number
+ } else if (id == mkvmuxer::kMkvTrackNumber) {
const long long num = UnserializeUInt(pReader, pos, size);
if ((num <= 0) || (num > 127))
return E_FILE_FORMAT_INVALID;
info.number = static_cast<long>(num);
- } else if (id == 0x03) { // Track Type
+ } else if (id == mkvmuxer::kMkvTrackType) {
const long long type = UnserializeUInt(pReader, pos, size);
if ((type <= 0) || (type > 254))
return E_FILE_FORMAT_INVALID;
info.type = static_cast<long>(type);
- } else if (id == 0x136E) { // Track Name
+ } else if (id == mkvmuxer::kMkvName) {
const long status =
UnserializeString(pReader, pos, size, info.nameAsUTF8);
if (status)
return status;
- } else if (id == 0x02B59C) { // Track Language
+ } else if (id == mkvmuxer::kMkvLanguage) {
const long status = UnserializeString(pReader, pos, size, info.language);
if (status)
return status;
- } else if (id == 0x03E383) { // Default Duration
+ } else if (id == mkvmuxer::kMkvDefaultDuration) {
const long long duration = UnserializeUInt(pReader, pos, size);
if (duration < 0)
return E_FILE_FORMAT_INVALID;
info.defaultDuration = static_cast<unsigned long long>(duration);
- } else if (id == 0x06) { // CodecID
+ } else if (id == mkvmuxer::kMkvCodecID) {
const long status = UnserializeString(pReader, pos, size, info.codecId);
if (status)
return status;
- } else if (id == 0x1C) { // lacing
+ } else if (id == mkvmuxer::kMkvFlagLacing) {
lacing = UnserializeUInt(pReader, pos, size);
if ((lacing < 0) || (lacing > 1))
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x23A2) { // Codec Private
+ } else if (id == mkvmuxer::kMkvCodecPrivate) {
delete[] info.codecPrivate;
info.codecPrivate = NULL;
info.codecPrivateSize = 0;
@@ -5390,9 +5509,7 @@
const size_t buflen = static_cast<size_t>(size);
if (buflen) {
- typedef unsigned char* buf_t;
-
- const buf_t buf = new (std::nothrow) unsigned char[buflen];
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
if (buf == NULL)
return -1;
@@ -5407,23 +5524,25 @@
info.codecPrivate = buf;
info.codecPrivateSize = buflen;
}
- } else if (id == 0x058688) { // Codec Name
+ } else if (id == mkvmuxer::kMkvCodecName) {
const long status =
UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
if (status)
return status;
- } else if (id == 0x16AA) { // Codec Delay
+ } else if (id == mkvmuxer::kMkvCodecDelay) {
info.codecDelay = UnserializeUInt(pReader, pos, size);
- } else if (id == 0x16BB) { // Seek Pre Roll
+ } else if (id == mkvmuxer::kMkvSeekPreRoll) {
info.seekPreRoll = UnserializeUInt(pReader, pos, size);
}
pos += size; // consume payload
- assert(pos <= track_stop);
+ if (pos > track_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == track_stop);
+ if (pos != track_stop)
+ return E_FILE_FORMAT_INVALID;
if (info.number <= 0) // not specified
return E_FILE_FORMAT_INVALID;
@@ -5552,98 +5671,88 @@
}
long Cluster::Load(long long& pos, long& len) const {
- assert(m_pSegment);
- assert(m_pos >= m_element_start);
+ if (m_pSegment == NULL)
+ return E_PARSE_FAILED;
if (m_timecode >= 0) // at least partially loaded
return 0;
- assert(m_pos == m_element_start);
- assert(m_element_size < 0);
+ if (m_pos != m_element_start || m_element_size >= 0)
+ return E_PARSE_FAILED;
IMkvReader* const pReader = m_pSegment->m_pReader;
-
long long total, avail;
-
const int status = pReader->Length(&total, &avail);
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
- assert((total < 0) || (m_pos <= total)); // TODO: verify this
+ if (total >= 0 && (avail > total || m_pos > total))
+ return E_FILE_FORMAT_INVALID;
pos = m_pos;
long long cluster_size = -1;
- {
- if ((pos + 1) > avail) {
- len = 1;
- return E_BUFFER_NOT_FULL;
- }
-
- long long result = GetUIntLength(pReader, pos, len);
-
- if (result < 0) // error or underflow
- return static_cast<long>(result);
-
- if (result > 0) // underflow (weird)
- return E_BUFFER_NOT_FULL;
-
- // if ((pos + len) > segment_stop)
- // return E_FILE_FORMAT_INVALID;
-
- if ((pos + len) > avail)
- return E_BUFFER_NOT_FULL;
-
- const long long id_ = ReadUInt(pReader, pos, len);
-
- if (id_ < 0) // error
- return static_cast<long>(id_);
-
- if (id_ != 0x0F43B675) // Cluster ID
- return E_FILE_FORMAT_INVALID;
-
- pos += len; // consume id
-
- // read cluster size
-
- if ((pos + 1) > avail) {
- len = 1;
- return E_BUFFER_NOT_FULL;
- }
-
- result = GetUIntLength(pReader, pos, len);
-
- if (result < 0) // error
- return static_cast<long>(result);
-
- if (result > 0) // weird
- return E_BUFFER_NOT_FULL;
-
- // if ((pos + len) > segment_stop)
- // return E_FILE_FORMAT_INVALID;
-
- if ((pos + len) > avail)
- return E_BUFFER_NOT_FULL;
-
- const long long size = ReadUInt(pReader, pos, len);
-
- if (size < 0) // error
- return static_cast<long>(cluster_size);
-
- if (size == 0)
- return E_FILE_FORMAT_INVALID; // TODO: verify this
-
- pos += len; // consume length of size of element
-
- const long long unknown_size = (1LL << (7 * len)) - 1;
-
- if (size != unknown_size)
- cluster_size = size;
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
}
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error or underflow
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id_ = ReadID(pReader, pos, len);
+
+ if (id_ < 0) // error
+ return static_cast<long>(id_);
+
+ if (id_ != mkvmuxer::kMkvCluster)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume id
+
+ // read cluster size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(cluster_size);
+
+ if (size == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of size of element
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size != unknown_size)
+ cluster_size = size;
+
// pos points to start of payload
long long timecode = -1;
long long new_pos = -1;
@@ -5667,7 +5776,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5676,7 +5785,7 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
@@ -5688,10 +5797,10 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
break;
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCues)
break;
pos += len; // consume ID field
@@ -5708,7 +5817,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5734,13 +5843,13 @@
// pos now points to start of payload
- if (size == 0) // weird
+ if (size == 0)
continue;
if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
return E_FILE_FORMAT_INVALID;
- if (id == 0x67) { // TimeCode ID
+ if (id == mkvmuxer::kMkvTimecode) {
len = static_cast<long>(size);
if ((pos + size) > avail)
@@ -5755,19 +5864,21 @@
if (bBlock)
break;
- } else if (id == 0x20) { // BlockGroup ID
+ } else if (id == mkvmuxer::kMkvBlockGroup) {
bBlock = true;
break;
- } else if (id == 0x23) { // SimpleBlock ID
+ } else if (id == mkvmuxer::kMkvSimpleBlock) {
bBlock = true;
break;
}
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if (timecode < 0) // no timecode found
return E_FILE_FORMAT_INVALID;
@@ -5790,10 +5901,8 @@
if (status < 0)
return status;
- assert(m_pos >= m_element_start);
- assert(m_timecode >= 0);
- // assert(m_size > 0);
- // assert(m_element_size > m_size);
+ if (m_pos < m_element_start || m_timecode < 0)
+ return E_PARSE_FAILED;
const long long cluster_stop =
(m_element_size < 0) ? -1 : m_element_start + m_element_size;
@@ -5810,7 +5919,8 @@
if (status < 0) // error
return status;
- assert((total < 0) || (avail <= total));
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
pos = m_pos;
@@ -5837,7 +5947,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5846,19 +5956,16 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
- if (id < 0) // error
- return static_cast<long>(id);
-
- if (id == 0) // weird
+ if (id < 0)
return E_FILE_FORMAT_INVALID;
// This is the distinguished set of ID's we use to determine
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) { // Cluster or Cues ID
+ if ((id == mkvmuxer::kMkvCluster) || (id == mkvmuxer::kMkvCues)) {
if (m_element_size < 0)
m_element_size = pos - m_element_start;
@@ -5879,7 +5986,7 @@
if (result < 0) // error
return static_cast<long>(result);
- if (result > 0) // weird
+ if (result > 0)
return E_BUFFER_NOT_FULL;
if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5905,7 +6012,7 @@
// pos now points to start of payload
- if (size == 0) // weird
+ if (size == 0)
continue;
// const long long block_start = pos;
@@ -5913,8 +6020,10 @@
if (cluster_stop >= 0) {
if (block_stop > cluster_stop) {
- if ((id == 0x20) || (id == 0x23))
+ if (id == mkvmuxer::kMkvBlockGroup ||
+ id == mkvmuxer::kMkvSimpleBlock) {
return E_FILE_FORMAT_INVALID;
+ }
pos = cluster_stop;
break;
@@ -5930,42 +6039,48 @@
Cluster* const this_ = const_cast<Cluster*>(this);
- if (id == 0x20) // BlockGroup
+ if (id == mkvmuxer::kMkvBlockGroup)
return this_->ParseBlockGroup(size, pos, len);
- if (id == 0x23) // SimpleBlock
+ if (id == mkvmuxer::kMkvSimpleBlock)
return this_->ParseSimpleBlock(size, pos, len);
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(m_element_size > 0);
+ if (m_element_size < 1)
+ return E_FILE_FORMAT_INVALID;
m_pos = pos;
- assert((cluster_stop < 0) || (m_pos <= cluster_stop));
+ if (cluster_stop >= 0 && m_pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if (m_entries_count > 0) {
const long idx = m_entries_count - 1;
const BlockEntry* const pLast = m_entries[idx];
- assert(pLast);
+ if (pLast == NULL)
+ return E_PARSE_FAILED;
const Block* const pBlock = pLast->GetBlock();
- assert(pBlock);
+ if (pBlock == NULL)
+ return E_PARSE_FAILED;
const long long start = pBlock->m_start;
if ((total >= 0) && (start > total))
- return -1; // defend against trucated stream
+ return E_PARSE_FAILED; // defend against trucated stream
const long long size = pBlock->m_size;
const long long stop = start + size;
- assert((cluster_stop < 0) || (stop <= cluster_stop));
+ if (cluster_stop >= 0 && stop > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
if ((total >= 0) && (stop > total))
- return -1; // defend against trucated stream
+ return E_PARSE_FAILED; // defend against trucated stream
}
return 1; // no more entries
@@ -6058,7 +6173,7 @@
return E_BUFFER_NOT_FULL;
}
- status = CreateBlock(0x23, // simple block id
+ status = CreateBlock(mkvmuxer::kMkvSimpleBlock,
block_start, block_size,
0); // DiscardPadding
@@ -6118,12 +6233,12 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
- if (id == 0) // not a value ID
+ if (id == 0) // not a valid ID
return E_FILE_FORMAT_INVALID;
pos += len; // consume ID field
@@ -6169,14 +6284,14 @@
if (size == unknown_size)
return E_FILE_FORMAT_INVALID;
- if (id == 0x35A2) { // DiscardPadding
+ if (id == mkvmuxer::kMkvDiscardPadding) {
status = UnserializeInt(pReader, pos, size, discard_padding);
if (status < 0) // error
return status;
}
- if (id != 0x21) { // sub-part of BlockGroup is not a Block
+ if (id != mkvmuxer::kMkvBlock) {
pos += size; // consume sub-part of block group
if (pos > payload_stop)
@@ -6262,12 +6377,14 @@
}
pos = block_stop; // consume block-part of block group
- assert(pos <= payload_stop);
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
}
- assert(pos == payload_stop);
+ if (pos != payload_stop)
+ return E_FILE_FORMAT_INVALID;
- status = CreateBlock(0x20, // BlockGroup ID
+ status = CreateBlock(mkvmuxer::kMkvBlockGroup,
payload_start, payload_size, discard_padding);
if (status != 0)
return status;
@@ -6310,17 +6427,14 @@
return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed
}
-Cluster* Cluster::Create(Segment* pSegment, long idx, long long off)
-// long long element_size)
-{
- assert(pSegment);
- assert(off >= 0);
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+ if (!pSegment || off < 0)
+ return NULL;
const long long element_start = pSegment->m_start + off;
- Cluster* const pCluster = new Cluster(pSegment, idx, element_start);
- // element_size);
- assert(pCluster);
+ Cluster* const pCluster =
+ new (std::nothrow) Cluster(pSegment, idx, element_start);
return pCluster;
}
@@ -6431,13 +6545,13 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
- if (id != 0x0F43B675) // weird: not cluster ID
- return -1; // generic error
+ if (id != mkvmuxer::kMkvCluster)
+ return E_PARSE_FAILED;
pos += len; // consume Cluster ID field
@@ -6515,7 +6629,7 @@
if ((pos + len) > avail)
return E_BUFFER_NOT_FULL;
- const long long id = ReadUInt(pReader, pos, len);
+ const long long id = ReadID(pReader, pos, len);
if (id < 0) // error
return static_cast<long>(id);
@@ -6524,10 +6638,10 @@
// that we have exhausted the sub-element's inside the cluster
// whose ID we parsed earlier.
- if (id == 0x0F43B675) // Cluster ID
+ if (id == mkvmuxer::kMkvCluster)
return 0; // no entries found
- if (id == 0x0C53BB6B) // Cues ID
+ if (id == mkvmuxer::kMkvCues)
return 0; // no entries found
pos += len; // consume id field
@@ -6579,14 +6693,15 @@
if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
return E_FILE_FORMAT_INVALID;
- if (id == 0x20) // BlockGroup ID
+ if (id == mkvmuxer::kMkvBlockGroup)
return 1; // have at least one entry
- if (id == 0x23) // SimpleBlock ID
+ if (id == mkvmuxer::kMkvSimpleBlock)
return 1; // have at least one entry
pos += size; // consume payload
- assert((cluster_stop < 0) || (pos <= cluster_stop));
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
}
}
@@ -6656,14 +6771,17 @@
long Cluster::CreateBlock(long long id,
long long pos, // absolute pos of payload
long long size, long long discard_padding) {
- assert((id == 0x20) || (id == 0x23)); // BlockGroup or SimpleBlock
+ if (id != mkvmuxer::kMkvBlockGroup && id != mkvmuxer::kMkvSimpleBlock)
+ return E_PARSE_FAILED;
if (m_entries_count < 0) { // haven't parsed anything yet
assert(m_entries == NULL);
assert(m_entries_size == 0);
m_entries_size = 1024;
- m_entries = new BlockEntry*[m_entries_size];
+ m_entries = new (std::nothrow) BlockEntry*[m_entries_size];
+ if (m_entries == NULL)
+ return -1;
m_entries_count = 0;
} else {
@@ -6674,8 +6792,9 @@
if (m_entries_count >= m_entries_size) {
const long entries_size = 2 * m_entries_size;
- BlockEntry** const entries = new BlockEntry*[entries_size];
- assert(entries);
+ BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size];
+ if (entries == NULL)
+ return -1;
BlockEntry** src = m_entries;
BlockEntry** const src_end = src + m_entries_count;
@@ -6692,9 +6811,9 @@
}
}
- if (id == 0x20) // BlockGroup ID
+ if (id == mkvmuxer::kMkvBlockGroup)
return CreateBlockGroup(pos, size, discard_padding);
- else // SimpleBlock ID
+ else
return CreateSimpleBlock(pos, size);
}
@@ -6725,9 +6844,9 @@
while (pos < stop) {
long len;
- const long long id = ReadUInt(pReader, pos, len);
- assert(id >= 0); // TODO
- assert((pos + len) <= stop);
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
pos += len; // consume ID
@@ -6737,12 +6856,12 @@
pos += len; // consume size
- if (id == 0x21) { // Block ID
+ if (id == mkvmuxer::kMkvBlock) {
if (bpos < 0) { // Block ID
bpos = pos;
bsize = size;
}
- } else if (id == 0x1B) { // Duration ID
+ } else if (id == mkvmuxer::kMkvBlockDuration) {
if (size > 8)
return E_FILE_FORMAT_INVALID;
@@ -6750,7 +6869,7 @@
if (duration < 0)
return E_FILE_FORMAT_INVALID;
- } else if (id == 0x7B) { // ReferenceBlock
+ } else if (id == mkvmuxer::kMkvReferenceBlock) {
if (size > 8 || size <= 0)
return E_FILE_FORMAT_INVALID;
const long size_ = static_cast<long>(size);
@@ -6764,17 +6883,19 @@
if (time <= 0) // see note above
prev = time;
- else // weird
+ else
next = time;
}
pos += size; // consume payload
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
}
if (bpos < 0)
return E_FILE_FORMAT_INVALID;
- assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
assert(bsize >= 0);
const long idx = m_entries_count;
@@ -7213,7 +7334,9 @@
return E_FILE_FORMAT_INVALID;
m_frame_count = 1;
- m_frames = new Frame[m_frame_count];
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
Frame& f = m_frames[0];
f.pos = pos;
@@ -7239,18 +7362,23 @@
return E_FILE_FORMAT_INVALID;
++pos; // consume frame count
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
m_frame_count = int(biased_count) + 1;
- m_frames = new Frame[m_frame_count];
- assert(m_frames);
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
+
+ if (!m_frames)
+ return E_FILE_FORMAT_INVALID;
if (lacing == 1) { // Xiph
Frame* pf = m_frames;
Frame* const pf_end = pf + m_frame_count;
- long size = 0;
+ long long size = 0;
int frame_count = m_frame_count;
while (frame_count > 1) {
@@ -7277,6 +7405,8 @@
Frame& f = *pf++;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
f.pos = 0; // patch later
@@ -7289,8 +7419,8 @@
--frame_count;
}
- assert(pf < pf_end);
- assert(pos <= stop);
+ if (pf >= pf_end || pos > stop)
+ return E_FILE_FORMAT_INVALID;
{
Frame& f = *pf++;
@@ -7318,11 +7448,17 @@
Frame& f = *pf++;
assert((pos + f.len) <= stop);
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
f.pos = pos;
pos += f.len;
}
assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
} else if (lacing == 2) { // fixed-size lacing
if (pos >= stop)
return E_FILE_FORMAT_INVALID;
@@ -7342,6 +7478,8 @@
while (pf != pf_end) {
assert((pos + frame_size) <= stop);
+ if ((pos + frame_size) > stop)
+ return E_FILE_FORMAT_INVALID;
Frame& f = *pf++;
@@ -7352,13 +7490,16 @@
}
assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
} else {
assert(lacing == 3); // EBML lacing
if (pos >= stop)
return E_FILE_FORMAT_INVALID;
- long size = 0;
+ long long size = 0;
int frame_count = m_frame_count;
long long frame_size = ReadUInt(pReader, pos, len);
@@ -7396,6 +7537,9 @@
return E_FILE_FORMAT_INVALID;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
const Frame& prev = *pf++;
assert(prev.len == frame_size);
@@ -7403,6 +7547,8 @@
return E_FILE_FORMAT_INVALID;
assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
Frame& curr = *pf;
@@ -7417,7 +7563,8 @@
return E_FILE_FORMAT_INVALID;
pos += len; // consume length of (delta) size
- assert(pos <= stop);
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
const int exp = 7 * len - 1;
const long long bias = (1LL << exp) - 1LL;
@@ -7439,18 +7586,20 @@
// parse last frame
if (frame_count > 0) {
- assert(pos <= stop);
- assert(pf < pf_end);
+ if (pos > stop || pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
const Frame& prev = *pf++;
assert(prev.len == frame_size);
if (prev.len != frame_size)
return E_FILE_FORMAT_INVALID;
- assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
Frame& curr = *pf++;
- assert(pf == pf_end);
+ if (pf != pf_end)
+ return E_FILE_FORMAT_INVALID;
curr.pos = 0; // patch later
@@ -7471,6 +7620,8 @@
while (pf != pf_end) {
Frame& f = *pf++;
assert((pos + f.len) <= stop);
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
f.pos = pos;
pos += f.len;
diff --git a/third_party/libwebm/mkvparser.hpp b/third_party/libwebm/mkvparser.hpp
index aa0b432..75ef69d 100644
--- a/third_party/libwebm/mkvparser.hpp
+++ b/third_party/libwebm/mkvparser.hpp
@@ -9,12 +9,13 @@
#ifndef MKVPARSER_HPP
#define MKVPARSER_HPP
-#include <cstdlib>
-#include <cstdio>
#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
namespace mkvparser {
+const int E_PARSE_FAILED = -1;
const int E_FILE_FORMAT_INVALID = -2;
const int E_BUFFER_NOT_FULL = -3;
@@ -27,8 +28,11 @@
virtual ~IMkvReader();
};
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size);
long long GetUIntLength(IMkvReader*, long long, long&);
long long ReadUInt(IMkvReader*, long long, long&);
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
long long UnserializeUInt(IMkvReader*, long long pos, long long size);
long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
@@ -833,7 +837,7 @@
private:
bool Init() const;
- void PreloadCuePoint(long&, long long) const;
+ bool PreloadCuePoint(long&, long long) const;
mutable CuePoint** m_cue_points;
mutable long m_count;
@@ -999,8 +1003,8 @@
long DoLoadClusterUnknownSize(long long&, long&);
long DoParseNext(const Cluster*&, long long&, long&);
- void AppendCluster(Cluster*);
- void PreloadCluster(Cluster*, ptrdiff_t);
+ bool AppendCluster(Cluster*);
+ bool PreloadCluster(Cluster*, ptrdiff_t);
// void ParseSeekHead(long long pos, long long size);
// void ParseSeekEntry(long long pos, long long size);
diff --git a/third_party/libwebm/webmids.hpp b/third_party/libwebm/webmids.hpp
index 6874e44..ad4ab57 100644
--- a/third_party/libwebm/webmids.hpp
+++ b/third_party/libwebm/webmids.hpp
@@ -41,6 +41,7 @@
kMkvTimecodeScale = 0x2AD7B1,
kMkvDuration = 0x4489,
kMkvDateUTC = 0x4461,
+ kMkvTitle = 0x7BA9,
kMkvMuxingApp = 0x4D80,
kMkvWritingApp = 0x5741,
// Cluster
@@ -107,9 +108,16 @@
kMkvContentEncodingOrder = 0x5031,
kMkvContentEncodingScope = 0x5032,
kMkvContentEncodingType = 0x5033,
+ kMkvContentCompression = 0x5034,
+ kMkvContentCompAlgo = 0x4254,
+ kMkvContentCompSettings = 0x4255,
kMkvContentEncryption = 0x5035,
kMkvContentEncAlgo = 0x47E1,
kMkvContentEncKeyID = 0x47E2,
+ kMkvContentSignature = 0x47E3,
+ kMkvContentSigKeyID = 0x47E4,
+ kMkvContentSigAlgo = 0x47E5,
+ kMkvContentSigHashAlgo = 0x47E6,
kMkvContentEncAESSettings = 0x47E7,
kMkvAESSettingsCipherMode = 0x47E8,
kMkvAESSettingsCipherInitData = 0x47E9,
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index ad849af..7217f6d 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -695,6 +695,13 @@
1023,
};
+const scan_order vp10_default_scan_orders[TX_SIZES] = {
+ {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+ {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+ {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+ {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+};
+
const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
diff --git a/vp10/common/scan.h b/vp10/common/scan.h
index ada67f4..f5a020f 100644
--- a/vp10/common/scan.h
+++ b/vp10/common/scan.h
@@ -29,6 +29,7 @@
const int16_t *neighbors;
} scan_order;
+extern const scan_order vp10_default_scan_orders[TX_SIZES];
extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
new file mode 100644
index 0000000..403b209
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm.c
@@ -0,0 +1,2499 @@
+/*
+ *
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vp10/common/vp10_inv_txfm.h"
+
+void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1, 8);
+ op[1] = WRAPLOW(b1, 8);
+ op[2] = WRAPLOW(c1, 8);
+ op[3] = WRAPLOW(d1, 8);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_iwht4x4_1_add_c(const tran_low_t *in,
+ uint8_t *dest,
+ int dest_stride) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1, 8);
+ op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+ dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+ dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+ dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3], 8);
+ output[1] = WRAPLOW(step[1] + step[2], 8);
+ output[2] = WRAPLOW(step[1] - step[2], 8);
+ output[3] = WRAPLOW(step[0] - step[3], 8);
+}
+
+void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ vp10_idct4_c(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_idct4_c(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 4));
+ }
+ }
+}
+
+void vp10_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_add(dest[0], a1);
+ dest[1] = clip_pixel_add(dest[1], a1);
+ dest[2] = clip_pixel_add(dest[2], a1);
+ dest[3] = clip_pixel_add(dest[3], a1);
+ dest += dest_stride;
+ }
+}
+
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ temp1 = (step1[0] + step1[2]) * cospi_16_64;
+ temp2 = (step1[0] - step1[2]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ // stage 3
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7], 8);
+ output[1] = WRAPLOW(step1[1] + step1[6], 8);
+ output[2] = WRAPLOW(step1[2] + step1[5], 8);
+ output[3] = WRAPLOW(step1[3] + step1[4], 8);
+ output[4] = WRAPLOW(step1[3] - step1[4], 8);
+ output[5] = WRAPLOW(step1[2] - step1[5], 8);
+ output[6] = WRAPLOW(step1[1] - step1[6], 8);
+ output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ for (i = 0; i < 8; ++i) {
+ vp10_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
+ output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+}
+
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+ // stage 2
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+ x0 = WRAPLOW(s0 + s2, 8);
+ x1 = WRAPLOW(s1 + s3, 8);
+ x2 = WRAPLOW(s0 - s2, 8);
+ x3 = WRAPLOW(s1 - s3, 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+ // stage 3
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
+
+ x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+
+ output[0] = WRAPLOW(x0, 8);
+ output[1] = WRAPLOW(-x4, 8);
+ output[2] = WRAPLOW(x6, 8);
+ output[3] = WRAPLOW(-x2, 8);
+ output[4] = WRAPLOW(x3, 8);
+ output[5] = WRAPLOW(-x7, 8);
+ output[6] = WRAPLOW(x5, 8);
+ output[7] = WRAPLOW(-x1, 8);
+}
+
+void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ // only first 4 row has non-zero coefs
+ for (i = 0; i < 4; ++i) {
+ vp10_idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+ step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+ step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+ step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+ step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+ step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+ step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+ step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+ step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+ step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+ step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+ step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+ step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+ step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+ step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+ step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+ step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15], 8);
+ output[1] = WRAPLOW(step2[1] + step2[14], 8);
+ output[2] = WRAPLOW(step2[2] + step2[13], 8);
+ output[3] = WRAPLOW(step2[3] + step2[12], 8);
+ output[4] = WRAPLOW(step2[4] + step2[11], 8);
+ output[5] = WRAPLOW(step2[5] + step2[10], 8);
+ output[6] = WRAPLOW(step2[6] + step2[9], 8);
+ output[7] = WRAPLOW(step2[7] + step2[8], 8);
+ output[8] = WRAPLOW(step2[7] - step2[8], 8);
+ output[9] = WRAPLOW(step2[6] - step2[9], 8);
+ output[10] = WRAPLOW(step2[5] - step2[10], 8);
+ output[11] = WRAPLOW(step2[4] - step2[11], 8);
+ output[12] = WRAPLOW(step2[3] - step2[12], 8);
+ output[13] = WRAPLOW(step2[2] - step2[13], 8);
+ output[14] = WRAPLOW(step2[1] - step2[14], 8);
+ output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ vp10_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4, 8);
+ x1 = WRAPLOW(s1 + s5, 8);
+ x2 = WRAPLOW(s2 + s6, 8);
+ x3 = WRAPLOW(s3 + s7, 8);
+ x4 = WRAPLOW(s0 - s4, 8);
+ x5 = WRAPLOW(s1 - s5, 8);
+ x6 = WRAPLOW(s2 - s6, 8);
+ x7 = WRAPLOW(s3 - s7, 8);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(check_range(s0 + s2), 8);
+ x1 = WRAPLOW(check_range(s1 + s3), 8);
+ x2 = WRAPLOW(check_range(s0 - s2), 8);
+ x3 = WRAPLOW(check_range(s1 - s3), 8);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+ x8 = WRAPLOW(check_range(s8 + s10), 8);
+ x9 = WRAPLOW(check_range(s9 + s11), 8);
+ x10 = WRAPLOW(check_range(s8 - s10), 8);
+ x11 = WRAPLOW(check_range(s9 - s11), 8);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+ x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+ x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+ x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+ x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+ x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+ x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+ x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+
+ output[0] = WRAPLOW(x0, 8);
+ output[1] = WRAPLOW(-x8, 8);
+ output[2] = WRAPLOW(x12, 8);
+ output[3] = WRAPLOW(-x4, 8);
+ output[4] = WRAPLOW(x6, 8);
+ output[5] = WRAPLOW(x14, 8);
+ output[6] = WRAPLOW(x10, 8);
+ output[7] = WRAPLOW(x2, 8);
+ output[8] = WRAPLOW(x3, 8);
+ output[9] = WRAPLOW(x11, 8);
+ output[10] = WRAPLOW(x15, 8);
+ output[11] = WRAPLOW(x7, 8);
+ output[12] = WRAPLOW(x5, 8);
+ output[13] = WRAPLOW(-x13, 8);
+ output[14] = WRAPLOW(x9, 8);
+ output[15] = WRAPLOW(-x1, 8);
+}
+
+void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 4x4 area, we only need to calculate first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ vp10_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ vp10_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct16x16_1_add_c(const tran_low_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+ step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+ step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+ step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+ step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+ step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+ step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+ step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+ step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+ step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+ step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+ step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+ step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+ step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+ step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+ step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+ step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+ step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+ step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+ step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+ step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+ step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+ step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+ step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+ step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+ step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+ step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+ step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+ step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+ step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+ step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+ step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+ step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+ step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+ step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+ step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+ step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+ step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+ step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+ step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+ step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+ step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+ step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+ step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+ step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+ step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+ step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+ step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+ step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+ step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+ step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+ step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+ step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+ step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+ step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+ step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+ step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+ step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+ step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+ step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+ step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+ step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+ step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+ step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+ step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+ step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+ step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+ step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+ step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+ step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+ step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+ step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+ step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+ step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+ step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+ step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+ step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+ step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+ step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31], 8);
+ output[1] = WRAPLOW(step1[1] + step1[30], 8);
+ output[2] = WRAPLOW(step1[2] + step1[29], 8);
+ output[3] = WRAPLOW(step1[3] + step1[28], 8);
+ output[4] = WRAPLOW(step1[4] + step1[27], 8);
+ output[5] = WRAPLOW(step1[5] + step1[26], 8);
+ output[6] = WRAPLOW(step1[6] + step1[25], 8);
+ output[7] = WRAPLOW(step1[7] + step1[24], 8);
+ output[8] = WRAPLOW(step1[8] + step1[23], 8);
+ output[9] = WRAPLOW(step1[9] + step1[22], 8);
+ output[10] = WRAPLOW(step1[10] + step1[21], 8);
+ output[11] = WRAPLOW(step1[11] + step1[20], 8);
+ output[12] = WRAPLOW(step1[12] + step1[19], 8);
+ output[13] = WRAPLOW(step1[13] + step1[18], 8);
+ output[14] = WRAPLOW(step1[14] + step1[17], 8);
+ output[15] = WRAPLOW(step1[15] + step1[16], 8);
+ output[16] = WRAPLOW(step1[15] - step1[16], 8);
+ output[17] = WRAPLOW(step1[14] - step1[17], 8);
+ output[18] = WRAPLOW(step1[13] - step1[18], 8);
+ output[19] = WRAPLOW(step1[12] - step1[19], 8);
+ output[20] = WRAPLOW(step1[11] - step1[20], 8);
+ output[21] = WRAPLOW(step1[10] - step1[21], 8);
+ output[22] = WRAPLOW(step1[9] - step1[22], 8);
+ output[23] = WRAPLOW(step1[8] - step1[23], 8);
+ output[24] = WRAPLOW(step1[7] - step1[24], 8);
+ output[25] = WRAPLOW(step1[6] - step1[25], 8);
+ output[26] = WRAPLOW(step1[5] - step1[26], 8);
+ output[27] = WRAPLOW(step1[4] - step1[27], 8);
+ output[28] = WRAPLOW(step1[3] - step1[28], 8);
+ output[29] = WRAPLOW(step1[2] - step1[29], 8);
+ output[30] = WRAPLOW(step1[1] - step1[30], 8);
+ output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ vp10_idct32_c(input, outptr);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ vp10_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
+ vp10_idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ vp10_idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vp10_idct32x32_1_add_c(const tran_low_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_high_t a1;
+
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1, bd);
+ op[1] = WRAPLOW(b1, bd);
+ op[2] = WRAPLOW(c1, bd);
+ op[3] = WRAPLOW(d1, bd);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void) bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1, bd);
+ op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = highbd_clip_pixel_add(
+ dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] = highbd_clip_pixel_add(
+ dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] = highbd_clip_pixel_add(
+ dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] = highbd_clip_pixel_add(
+ dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void) bd;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3], bd);
+ output[1] = WRAPLOW(step[1] + step[2], bd);
+ output[2] = WRAPLOW(step[1] - step[2], bd);
+ output[3] = WRAPLOW(step[0] - step[3], bd);
+}
+
+void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+ dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+ dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+ dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+ dest += dest_stride;
+ }
+}
+
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2 & stage 3 - even half
+ vp10_highbd_idct4_c(step1, step1, bd);
+
+ // stage 2 - odd half
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ // stage 3 - odd half
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7], bd);
+ output[1] = WRAPLOW(step1[1] + step1[6], bd);
+ output[2] = WRAPLOW(step1[2] + step1[5], bd);
+ output[3] = WRAPLOW(step1[3] + step1[4], bd);
+ output[4] = WRAPLOW(step1[3] - step1[4], bd);
+ output[5] = WRAPLOW(step1[2] - step1[5], bd);
+ output[6] = WRAPLOW(step1[1] - step1[6], bd);
+ output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 8; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3)) {
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = (tran_high_t)(x0 - x2 + x3);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_low_t x0 = input[7];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = WRAPLOW(s0 + s2, bd);
+ x1 = WRAPLOW(s1 + s3, bd);
+ x2 = WRAPLOW(s0 - s2, bd);
+ x3 = WRAPLOW(s1 - s3, bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
+ output[0] = WRAPLOW(x0, bd);
+ output[1] = WRAPLOW(-x4, bd);
+ output[2] = WRAPLOW(x6, bd);
+ output[3] = WRAPLOW(-x2, bd);
+ output[4] = WRAPLOW(x3, bd);
+ output[5] = WRAPLOW(-x7, bd);
+ output[6] = WRAPLOW(x5, bd);
+ output[7] = WRAPLOW(-x1, bd);
+}
+
+void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ // Only first 4 row has non-zero coefs.
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15], bd);
+ output[1] = WRAPLOW(step2[1] + step2[14], bd);
+ output[2] = WRAPLOW(step2[2] + step2[13], bd);
+ output[3] = WRAPLOW(step2[3] + step2[12], bd);
+ output[4] = WRAPLOW(step2[4] + step2[11], bd);
+ output[5] = WRAPLOW(step2[5] + step2[10], bd);
+ output[6] = WRAPLOW(step2[6] + step2[9], bd);
+ output[7] = WRAPLOW(step2[7] + step2[8], bd);
+ output[8] = WRAPLOW(step2[7] - step2[8], bd);
+ output[9] = WRAPLOW(step2[6] - step2[9], bd);
+ output[10] = WRAPLOW(step2[5] - step2[10], bd);
+ output[11] = WRAPLOW(step2[4] - step2[11], bd);
+ output[12] = WRAPLOW(step2[3] - step2[12], bd);
+ output[13] = WRAPLOW(step2[2] - step2[13], bd);
+ output[14] = WRAPLOW(step2[1] - step2[14], bd);
+ output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 16; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6),
+ bd);
+ }
+ }
+}
+
+void vp10_highbd_iadst16_c(const tran_low_t *input,
+ tran_low_t *output,
+ int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_low_t x0 = input[15];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[13];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[11];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[9];
+ tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4, bd);
+ x1 = WRAPLOW(s1 + s5, bd);
+ x2 = WRAPLOW(s2 + s6, bd);
+ x3 = WRAPLOW(s3 + s7, bd);
+ x4 = WRAPLOW(s0 - s4, bd);
+ x5 = WRAPLOW(s1 - s5, bd);
+ x6 = WRAPLOW(s2 - s6, bd);
+ x7 = WRAPLOW(s3 - s7, bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2, bd);
+ x1 = WRAPLOW(s1 + s3, bd);
+ x2 = WRAPLOW(s0 - s2, bd);
+ x3 = WRAPLOW(s1 - s3, bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+ x8 = WRAPLOW(s8 + s10, bd);
+ x9 = WRAPLOW(s9 + s11, bd);
+ x10 = WRAPLOW(s8 - s10, bd);
+ x11 = WRAPLOW(s9 - s11, bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+ output[0] = WRAPLOW(x0, bd);
+ output[1] = WRAPLOW(-x8, bd);
+ output[2] = WRAPLOW(x12, bd);
+ output[3] = WRAPLOW(-x4, bd);
+ output[4] = WRAPLOW(x6, bd);
+ output[5] = WRAPLOW(x14, bd);
+ output[6] = WRAPLOW(x10, bd);
+ output[7] = WRAPLOW(x2, bd);
+ output[8] = WRAPLOW(x3, bd);
+ output[9] = WRAPLOW(x11, bd);
+ output[10] = WRAPLOW(x15, bd);
+ output[11] = WRAPLOW(x7, bd);
+ output[12] = WRAPLOW(x5, bd);
+ output[13] = WRAPLOW(-x13, bd);
+ output[14] = WRAPLOW(x9, bd);
+ output[15] = WRAPLOW(-x1, bd);
+}
+
+void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 4x4 area, we only need to calculate first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void highbd_idct32_c(const tran_low_t *input,
+ tran_low_t *output, int bd) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+ step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+ step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+ step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+ step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+ step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+ step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+ step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+ step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+ step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+ step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+ step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+ step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+ step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+ step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+ step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+ step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+ step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+ step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+ step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+ step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+ step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+ step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+ step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+ step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+ step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+ step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+ step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+ step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+ step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+ step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+ step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+ step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+ step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+ step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+ step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+ step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+ step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+ step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+ step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+ step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+ step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+ step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+ step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+ step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+ step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+ step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+ step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+ step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+ step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+ step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+ step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+ step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+ step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+ step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+ step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+ step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+ step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+ step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+ step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31], bd);
+ output[1] = WRAPLOW(step1[1] + step1[30], bd);
+ output[2] = WRAPLOW(step1[2] + step1[29], bd);
+ output[3] = WRAPLOW(step1[3] + step1[28], bd);
+ output[4] = WRAPLOW(step1[4] + step1[27], bd);
+ output[5] = WRAPLOW(step1[5] + step1[26], bd);
+ output[6] = WRAPLOW(step1[6] + step1[25], bd);
+ output[7] = WRAPLOW(step1[7] + step1[24], bd);
+ output[8] = WRAPLOW(step1[8] + step1[23], bd);
+ output[9] = WRAPLOW(step1[9] + step1[22], bd);
+ output[10] = WRAPLOW(step1[10] + step1[21], bd);
+ output[11] = WRAPLOW(step1[11] + step1[20], bd);
+ output[12] = WRAPLOW(step1[12] + step1[19], bd);
+ output[13] = WRAPLOW(step1[13] + step1[18], bd);
+ output[14] = WRAPLOW(step1[14] + step1[17], bd);
+ output[15] = WRAPLOW(step1[15] + step1[16], bd);
+ output[16] = WRAPLOW(step1[15] - step1[16], bd);
+ output[17] = WRAPLOW(step1[14] - step1[17], bd);
+ output[18] = WRAPLOW(step1[13] - step1[18], bd);
+ output[19] = WRAPLOW(step1[12] - step1[19], bd);
+ output[20] = WRAPLOW(step1[11] - step1[20], bd);
+ output[21] = WRAPLOW(step1[10] - step1[21], bd);
+ output[22] = WRAPLOW(step1[9] - step1[22], bd);
+ output[23] = WRAPLOW(step1[8] - step1[23], bd);
+ output[24] = WRAPLOW(step1[7] - step1[24], bd);
+ output[25] = WRAPLOW(step1[6] - step1[25], bd);
+ output[26] = WRAPLOW(step1[5] - step1[26], bd);
+ output[27] = WRAPLOW(step1[4] - step1[27], bd);
+ output[28] = WRAPLOW(step1[3] - step1[28], bd);
+ output[29] = WRAPLOW(step1[2] - step1[29], bd);
+ output[30] = WRAPLOW(step1[1] - step1[30], bd);
+ output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_low_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ highbd_idct32_c(input, outptr, bd);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ // Only upper-left 8x8 has non-zero coeff.
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32_c(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vp10_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ int a1;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_inv_txfm.h b/vp10/common/vp10_inv_txfm.h
new file mode 100644
index 0000000..52611ac
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid VP9 input streams, intermediate stage coefficients should always
+ // stay within the range of a signed 16 bit integer. Coefficients can go out
+ // of this range for invalid/corrupt VP9 streams. However, strictly checking
+ // this range for every intermediate coefficient can burdensome for a decoder,
+ // therefore the following assertion is only enabled when configured with
+ // --enable-coefficient-range-checking.
+ assert(INT16_MIN <= input);
+ assert(input <= INT16_MAX);
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return check_range(rv);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+ int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void) int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void) bd;
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+ int bd) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return highbd_check_range(rv, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However to aid in hardware
+// verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C-implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// bd of 8 uses trans_low with 16bits, need to remove 16bits
+// bd of 10 uses trans_low with 18bits, need to remove 14bits
+// bd of 12 uses trans_low with 20bits, need to remove 12bits
+// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif // CONFIG_EMULATE_HARDWARE
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = WRAPLOW(trans, bd);
+ return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+ trans = WRAPLOW(trans, 8);
+ return clip_pixel(WRAPLOW(dest + trans, 8));
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // VPX_DSP_INV_TXFM_H_
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 37b9622..ca3729c 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -289,6 +289,188 @@
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
+# Inverse transform
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ # Note as optimized versions of these functions are added we need to add a check to ensure
+ # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+
+ add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_1_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_1_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_1_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_34_add/;
+
+ add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct32x32_1_add/;
+
+ add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_iwht4x4_16_add/;
+
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+ add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_16_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_64_add/;
+
+ add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_10_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_256_add/;
+
+ add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_10_add/;
+ } else {
+ add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct8x8_10_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp10_highbd_idct16x16_10_add sse2/;
+ } # CONFIG_EMULATE_HARDWARE
+} else {
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+ } else {
+ add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_1_add sse2/;
+
+ add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_1_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct8x8_12_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_1_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct16x16_10_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1024_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_34_add sse2/;
+
+ add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_idct32x32_1_add sse2/;
+
+ add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_1_add/;
+
+ add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp10_iwht4x4_16_add/;
+ } # CONFIG_EMULATE_HARDWARE
+} # CONFIG_VP9_HIGHBITDEPTH
+
#
# Motion search
#
diff --git a/vp10/common/x86/vp10_inv_txfm_sse2.c b/vp10/common/x86/vp10_inv_txfm_sse2.c
new file mode 100644
index 0000000..b25e22e
--- /dev/null
+++ b/vp10/common/x86/vp10_inv_txfm_sse2.c
@@ -0,0 +1,4058 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/x86/vp10_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+#define RECON_AND_STORE4X4(dest, in_x) \
+{ \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vp10_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i cst = _mm_setr_epi16(
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i input0, input1, input2, input3;
+
+ // Rows
+ input0 = _mm_load_si128((const __m128i *)input);
+ input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_shufflelo_epi16(input0, 0xd8);
+ input0 = _mm_shufflehi_epi16(input0, 0xd8);
+ input2 = _mm_shufflelo_epi16(input2, 0xd8);
+ input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+ input1 = _mm_unpackhi_epi32(input0, input0);
+ input0 = _mm_unpacklo_epi32(input0, input0);
+ input3 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpacklo_epi32(input2, input2);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input1);
+ input1 = _mm_packs_epi32(input2, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+ // Switch column2, column 3, and then, we got:
+ // input2: column1, column 0; input3: column2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Columns
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_unpacklo_epi32(input2, input2);
+ input1 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpackhi_epi32(input3, input3);
+ input3 = _mm_unpacklo_epi32(input3, input3);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, input2);
+ input1 = _mm_packs_epi32(input1, input3);
+
+ // Transpose
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
+
+ // Switch column2, column 3, and then, we got:
+ // input2: column1, column 0; input3: column2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Final round and shift
+ input2 = _mm_add_epi16(input2, eight);
+ input3 = _mm_add_epi16(input3, eight);
+
+ input2 = _mm_srai_epi16(input2, 4);
+ input3 = _mm_srai_epi16(input3, 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void vp10_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void vp10_idct4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+
+ transpose_4x4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[3], v[2]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void vp10_iadst4_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8], in7;
+
+ transpose_4x4(in);
+ in7 = _mm_srli_si128(in[1], 8);
+ in7 = _mm_add_epi16(in7, in[0]);
+ in7 = _mm_sub_epi16(in7, in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(v[3], v[4]);
+ u[2] = v[2];
+ u[3] = _mm_add_epi32(u[0], u[1]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_add_epi32(u[3], v[5]);
+ u[6] = _mm_sub_epi32(u[5], u[4]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+ out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ }
+
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+ cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_4, \
+ stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_0, \
+ stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+ out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+ }
+
+void vp10_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vp10_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ // 4-stage 1D vp10_idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ }
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void vp10_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+void vp10_idct8_sse2(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ // 8x8 Transpose is copied from vp10_fdct8x8_sse2()
+ TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ // 4-stage 1D vp10_idct8x8
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+void vp10_iadst8_sse2(__m128i *in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // transpose
+ array_transpose_8x8(in, in);
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit intergers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void vp10_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ // Rows. Load 4-row input data.
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+ // 8x4 Transpose
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+ // Stage1
+ {
+ const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+ const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+ }
+
+ // Stage2
+ {
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+ const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+ tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+ tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+ tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+ tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+ tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+ stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+ tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+#define IDCT16 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+ stg2_0, stg2_1, stg2_2, stg2_3, \
+ stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+ stg2_4, stg2_5, stg2_6, stg2_7, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+ stg3_0, stg3_1, stg3_2, stg3_3, \
+ stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ \
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+ stg4_0, stg4_1, stg4_2, stg4_3, \
+ stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
+
+#define IDCT16_10 \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+ stg2_0, stg2_1, stg2_6, stg2_7, \
+ stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+ stg3_0, stg3_1, \
+ stp2_4, stp2_7) \
+ \
+ stp1_9 = stp1_8_0; \
+ stp1_10 = stp1_11; \
+ \
+ stp1_13 = stp1_12_0; \
+ stp1_14 = stp1_15; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+ stg4_0, stg4_1, \
+ stp1_0, stp1_1) \
+ stp2_5 = stp2_4; \
+ stp2_6 = stp2_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_2 = stp1_1; \
+ stp1_3 = stp1_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
+
+void vp10_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[16], l[16], r[16], *curr1;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ curr1 = l;
+ for (i = 0; i < 2; i++) {
+ // 1-D vp10_idct
+
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+
+ IDCT16
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
+ }
+ for (i = 0; i < 2; i++) {
+ int j;
+ // 1-D vp10_idct
+ array_transpose_8x8(l + i * 8, in);
+ array_transpose_8x8(r + i * 8, in + 8);
+
+ IDCT16
+
+ // 2-D
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct16x16_1_add_sse2(const int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 2; ++i) {
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+ RECON_AND_STORE(dest + 8 * stride, dc_value);
+ RECON_AND_STORE(dest + 9 * stride, dc_value);
+ RECON_AND_STORE(dest + 10 * stride, dc_value);
+ RECON_AND_STORE(dest + 11 * stride, dc_value);
+ RECON_AND_STORE(dest + 12 * stride, dc_value);
+ RECON_AND_STORE(dest + 13 * stride, dc_value);
+ RECON_AND_STORE(dest + 14 * stride, dc_value);
+ RECON_AND_STORE(dest + 15 * stride, dc_value);
+ dest += 8;
+ }
+}
+
+static void vp10_iadst16_8col(__m128i *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void vp10_idct16_8col(__m128i *in) {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i v[16], u[16], s[16], t[16];
+
+ // stage 1
+ s[0] = in[0];
+ s[1] = in[8];
+ s[2] = in[4];
+ s[3] = in[12];
+ s[4] = in[2];
+ s[5] = in[10];
+ s[6] = in[6];
+ s[7] = in[14];
+ s[8] = in[1];
+ s[9] = in[9];
+ s[10] = in[5];
+ s[11] = in[13];
+ s[12] = in[3];
+ s[13] = in[11];
+ s[14] = in[7];
+ s[15] = in[15];
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+ u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+ u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+ u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[8] = _mm_packs_epi32(u[0], u[1]);
+ s[15] = _mm_packs_epi32(u[2], u[3]);
+ s[9] = _mm_packs_epi32(u[4], u[5]);
+ s[14] = _mm_packs_epi32(u[6], u[7]);
+ s[10] = _mm_packs_epi32(u[8], u[9]);
+ s[13] = _mm_packs_epi32(u[10], u[11]);
+ s[11] = _mm_packs_epi32(u[12], u[13]);
+ s[12] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ t[0] = s[0];
+ t[1] = s[1];
+ t[2] = s[2];
+ t[3] = s[3];
+ u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[4] = _mm_packs_epi32(u[0], u[1]);
+ t[7] = _mm_packs_epi32(u[2], u[3]);
+ t[5] = _mm_packs_epi32(u[4], u[5]);
+ t[6] = _mm_packs_epi32(u[6], u[7]);
+ t[8] = _mm_add_epi16(s[8], s[9]);
+ t[9] = _mm_sub_epi16(s[8], s[9]);
+ t[10] = _mm_sub_epi16(s[11], s[10]);
+ t[11] = _mm_add_epi16(s[10], s[11]);
+ t[12] = _mm_add_epi16(s[12], s[13]);
+ t[13] = _mm_sub_epi16(s[12], s[13]);
+ t[14] = _mm_sub_epi16(s[15], s[14]);
+ t[15] = _mm_add_epi16(s[14], s[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+ u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+ u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+ u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+ u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+ u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_add_epi16(t[4], t[5]);
+ s[5] = _mm_sub_epi16(t[4], t[5]);
+ s[6] = _mm_sub_epi16(t[7], t[6]);
+ s[7] = _mm_add_epi16(t[6], t[7]);
+ s[8] = t[8];
+ s[15] = t[15];
+ s[9] = _mm_packs_epi32(u[8], u[9]);
+ s[14] = _mm_packs_epi32(u[10], u[11]);
+ s[10] = _mm_packs_epi32(u[12], u[13]);
+ s[13] = _mm_packs_epi32(u[14], u[15]);
+ s[11] = t[11];
+ s[12] = t[12];
+
+ // stage 5
+ t[0] = _mm_add_epi16(s[0], s[3]);
+ t[1] = _mm_add_epi16(s[1], s[2]);
+ t[2] = _mm_sub_epi16(s[1], s[2]);
+ t[3] = _mm_sub_epi16(s[0], s[3]);
+ t[4] = s[4];
+ t[7] = s[7];
+
+ u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ t[5] = _mm_packs_epi32(u[0], u[1]);
+ t[6] = _mm_packs_epi32(u[2], u[3]);
+
+ t[8] = _mm_add_epi16(s[8], s[11]);
+ t[9] = _mm_add_epi16(s[9], s[10]);
+ t[10] = _mm_sub_epi16(s[9], s[10]);
+ t[11] = _mm_sub_epi16(s[8], s[11]);
+ t[12] = _mm_sub_epi16(s[15], s[12]);
+ t[13] = _mm_sub_epi16(s[14], s[13]);
+ t[14] = _mm_add_epi16(s[13], s[14]);
+ t[15] = _mm_add_epi16(s[12], s[15]);
+
+ // stage 6
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+
+ // stage 7
+ in[0] = _mm_add_epi16(s[0], s[15]);
+ in[1] = _mm_add_epi16(s[1], s[14]);
+ in[2] = _mm_add_epi16(s[2], s[13]);
+ in[3] = _mm_add_epi16(s[3], s[12]);
+ in[4] = _mm_add_epi16(s[4], s[11]);
+ in[5] = _mm_add_epi16(s[5], s[10]);
+ in[6] = _mm_add_epi16(s[6], s[9]);
+ in[7] = _mm_add_epi16(s[7], s[8]);
+ in[8] = _mm_sub_epi16(s[7], s[8]);
+ in[9] = _mm_sub_epi16(s[6], s[9]);
+ in[10] = _mm_sub_epi16(s[5], s[10]);
+ in[11] = _mm_sub_epi16(s[4], s[11]);
+ in[12] = _mm_sub_epi16(s[3], s[12]);
+ in[13] = _mm_sub_epi16(s[2], s[13]);
+ in[14] = _mm_sub_epi16(s[1], s[14]);
+ in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
+void vp10_idct16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ vp10_idct16_8col(in0);
+ vp10_idct16_8col(in1);
+}
+
+void vp10_iadst16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ vp10_iadst16_8col(in0);
+ vp10_iadst16_8col(in1);
+}
+
+void vp10_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i in[16], l[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+ // Stage2
+ {
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+ tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+ tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+ tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp2_8 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+ tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+ stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+ tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+ tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+ tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+ stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+ stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+ stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+ }
+
+ // Stage5 and Stage6
+ {
+ tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+ tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+ tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+ tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+ stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+ stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+ stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+ stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+ }
+
+ // Stage6
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+ tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+ tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+ tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+ tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+ stp2_10 = _mm_packs_epi32(tmp0, zero);
+ stp2_13 = _mm_packs_epi32(tmp2, zero);
+ stp2_11 = _mm_packs_epi32(tmp4, zero);
+ stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+ tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+ tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+ tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+ tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+ stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+ stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+ stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+ stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+ stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage7. Left 8x16 only.
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 2; i++) {
+ int j;
+ array_transpose_4X8(l + 8 * i, in);
+
+ IDCT16_10
+
+ // Stage7
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+#define LOAD_DQCOEFF(reg, input) \
+ { \
+ reg = _mm_load_si128((const __m128i *) input); \
+ input += 8; \
+ } \
+
+#define IDCT32_34 \
+/* Stage1 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+ const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+ stg1_1, stp1_16, stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+ stg1_7, stp1_19, stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+ stg1_9, stp1_20, stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+ stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+ stg2_1, stp2_8, stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+ stg2_7, stp2_11, stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+ stg3_1, stp1_4, stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+ stg4_1, stp2_0, stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+ const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vp10_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+ // vp10_idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[32];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 32));
+ in[2] = _mm_load_si128((const __m128i *)(input + 64));
+ in[3] = _mm_load_si128((const __m128i *)(input + 96));
+ in[4] = _mm_load_si128((const __m128i *)(input + 128));
+ in[5] = _mm_load_si128((const __m128i *)(input + 160));
+ in[6] = _mm_load_si128((const __m128i *)(input + 192));
+ in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+ for (i = 8; i < 32; ++i) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ array_transpose_8x8(in, in);
+ // TODO(hkuang): Following transposes are unnecessary. But remove them will
+ // lead to performance drop on some devices.
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32_34
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ for (i = 0; i < 4; i++) {
+ int j;
+ const __m128i zero = _mm_setzero_si128();
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ IDCT32_34
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ // vp10_idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in[32], col[128], zero_idx[16];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+ for (i = 0; i < 4; i++) {
+ i32 = (i << 5);
+ // First 1-D vp10_idct
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
+
+ // checking if all entries are zero
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ }
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ }
+ for (i = 0; i < 4; i++) {
+ // Second 1-D vp10_idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ IDCT32
+
+ // 2_D: Calculate the results and store them to destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vp10_idct32x32_1_add_sse2(const int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 4; ++i) {
+ int j;
+ for (j = 0; j < 32; ++j) {
+ RECON_AND_STORE(dest + j * stride, dc_value);
+ }
+ dest += 8;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
+
+void vp10_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ transpose_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp10_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_8x8(inptr, inptr);
+ for (i = 0; i < 8; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 8; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct8_sse2(inptr);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ // only first 4 row has non-zero coefs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ // Use fact only first 4 rows contain non-zero coeffs
+ array_transpose_4X8(inptr, inptr);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct8_sse2(inptr);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp10_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_16x16(inptr, inptr + 16);
+ for (i = 0; i < 16; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 16; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+
+void vp10_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ // Since all non-zero dct coefficients are in upper-left 4x4 area,
+ // we only need to consider first 4 rows here.
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform (N.B. This transposes inptr)
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 16; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ // Use fact only first 4 rows contain non-zero coeffs
+ array_transpose_8x8(inptr, inptr);
+ array_transpose_8x8(inptr + 8, inptr + 16);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp10_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ vp10_idct16_sse2(inptr, inptr + 16);
+
+ // Final round & shift and Reconstruction and Store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp10_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/x86/vp10_inv_txfm_sse2.h b/vp10/common/x86/vp10_inv_txfm_sse2.h
new file mode 100644
index 0000000..b79781a
--- /dev/null
+++ b/vp10/common/x86/vp10_inv_txfm_sse2.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
+}
+
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
+ }
+
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 10 * stride, in[10]);
+ RECON_AND_STORE(dest + 11 * stride, in[11]);
+ RECON_AND_STORE(dest + 12 * stride, in[12]);
+ RECON_AND_STORE(dest + 13 * stride, in[13]);
+ RECON_AND_STORE(dest + 14 * stride, in[14]);
+ RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 16a723c..dc44507 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -19,7 +19,6 @@
VP10_COMMON_SRCS-yes += common/entropymv.c
VP10_COMMON_SRCS-yes += common/frame_buffers.c
VP10_COMMON_SRCS-yes += common/frame_buffers.h
-VP10_COMMON_SRCS-yes += common/idct.c
VP10_COMMON_SRCS-yes += common/alloccommon.h
VP10_COMMON_SRCS-yes += common/blockd.h
VP10_COMMON_SRCS-yes += common/common.h
@@ -30,6 +29,9 @@
VP10_COMMON_SRCS-yes += common/filter.h
VP10_COMMON_SRCS-yes += common/filter.c
VP10_COMMON_SRCS-yes += common/idct.h
+VP10_COMMON_SRCS-yes += common/idct.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.c
VP10_COMMON_SRCS-yes += common/loopfilter.h
VP10_COMMON_SRCS-yes += common/thread_common.h
VP10_COMMON_SRCS-yes += common/mv.h
@@ -91,4 +93,7 @@
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
endif
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.h
+
$(eval $(call rtcd_h_template,vp10_rtcd,vp10/common/vp10_rtcd_defs.pl))
diff --git a/webmdec.cc b/webmdec.cc
index 1020d04..f541cfe 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -94,7 +94,7 @@
}
}
- if (video_track == NULL) {
+ if (video_track == NULL || video_track->GetCodecId() == NULL) {
rewind_and_reset(webm_ctx, vpx_ctx);
return 0;
}