Merge "Store predicted motion vectors" into nextgenv2
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index bf75a29..dac001f 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -16,6 +16,7 @@
#include "test/acm_random.h"
#include "test/util.h"
#include "./vpx_config.h"
+#include "vpx_dsp/psnr.h"
#include "vpx_dsp/ssim.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/msvc.h"
@@ -32,6 +33,19 @@
const YV12_BUFFER_CONFIG *dest,
uint32_t bd);
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t bit_depth) {
+ PSNR_STATS psnr;
+ calc_highbd_psnr(source, dest, &psnr, bit_depth, bit_depth);
+ return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest) {
+ PSNR_STATS psnr;
+ calc_psnr(source, dest, &psnr);
+ return psnr.psnr[0];
+}
double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
const YV12_BUFFER_CONFIG *dest,
@@ -208,5 +222,13 @@
MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 12,
kPhvs_thresh)));
+INSTANTIATE_TEST_CASE_P(
+ PSNR, HBDMetricsTest,
+ ::testing::Values(
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10,
+ kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12,
+ kPhvs_thresh)));
+
} // namespace
diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index a3aa3cf..a5987f1 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -220,7 +220,7 @@
#if CONFIG_EXT_INTRA
const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS] = {
- NULL, // INTRA_FILTER_LINEAR
+ bilinear_filters, // INTRA_FILTER_LINEAR
sub_pel_filters_8, // INTRA_FILTER_8TAP
sub_pel_filters_8sharp, // INTRA_FILTER_8TAP_SHARP
sub_pel_filters_8smooth, // INTRA_FILTER_8TAP_SMOOTH
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 6f38f74..dbb50fb 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -259,6 +259,73 @@
output[15] = WRAPLOW(-step2[0] + step2[15], 8);
}
+#if CONFIG_EXT_TX
+// For use in lieu of DST
+static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 8; ++i) {
+ output[i] = input[16 + i] * 4;
+ output[24 + i] = input[24 + i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ idct16_c(inputhalf, output + 8);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ idct16_c(inputhalf, output + 16);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 8; ++i) {
+ output[i] = input[16 + i] * 4;
+ output[24 + i] = input[24 + i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+ input[i] * Sqrt2, bd);
+ }
+ vpx_highbd_idct16_c(inputhalf, output + 8, bd);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+ input[i] * Sqrt2, bd);
+ }
+ vpx_highbd_idct16_c(inputhalf, output + 16, bd);
+ // Note overall scaling factor is 4 times orthogonal
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_EXT_TX
+
// Inverse identiy transform and add.
static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int bs) {
@@ -808,6 +875,67 @@
}
}
+#if CONFIG_EXT_TX
+void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ static const transform_2d IHT_32[] = {
+ { idct32_c, idct32_c }, // DCT_DCT = 0,
+ { ihalfright32_c, idct32_c }, // ADST_DCT = 1,
+ { idct32_c, ihalfright32_c }, // DCT_ADST = 2,
+ { ihalfright32_c, ihalfright32_c }, // ADST_ADST = 3,
+ { ihalfright32_c, idct32_c }, // FLIPADST_DCT = 4,
+ { idct32_c, ihalfright32_c }, // DCT_FLIPADST = 5,
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_FLIPADST = 6,
+ { ihalfright32_c, ihalfright32_c }, // ADST_FLIPADST = 7,
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_ADST = 8,
+ { ihalfcenter32_c, idct32_c }, // DST_DCT = 9,
+ { idct32_c, ihalfcenter32_c }, // DCT_DST = 10,
+ { ihalfcenter32_c, ihalfright32_c }, // DST_ADST = 11,
+ { ihalfright32_c, ihalfcenter32_c }, // ADST_DST = 12,
+ { ihalfcenter32_c, ihalfright32_c }, // DST_FLIPADST = 13,
+ { ihalfright32_c, ihalfcenter32_c }, // FLIPADST_DST = 14,
+ { ihalfcenter32_c, ihalfcenter32_c }, // DST_DST = 15
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].rows(input, out[i]);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1 ; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].cols(out[i], out[i]);
+ }
+
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
// idct
void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
@@ -998,15 +1126,27 @@
vp10_idct32x32_add(input, dest, stride, eob);
break;
#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case DST_DST:
+ case DST_DCT:
+ case DCT_DST:
+ case DST_ADST:
+ case ADST_DST:
+ case FLIPADST_DST:
+ case DST_FLIPADST:
+ vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
+ break;
case IDTX:
inv_idtx_add_c(input, dest, stride, 32);
break;
#endif // CONFIG_EXT_TX
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
- assert(0);
- break;
default:
assert(0);
break;
@@ -1212,6 +1352,70 @@
}
}
+#if CONFIG_EXT_TX
+void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32[] = {
+ { vpx_highbd_idct32_c, vpx_highbd_idct32_c }, // DCT_DCT
+ { highbd_ihalfright32_c, vpx_highbd_idct32_c }, // ADST_DCT
+ { vpx_highbd_idct32_c, highbd_ihalfright32_c }, // DCT_ADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_ADST
+ { highbd_ihalfright32_c, vpx_highbd_idct32_c }, // FLIPADST_DCT
+ { vpx_highbd_idct32_c, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_ihalfcenter32_c, vpx_highbd_idct32_c }, // DST_DCT
+ { vpx_highbd_idct32_c, highbd_ihalfcenter32_c }, // DCT_DST
+ { highbd_ihalfcenter32_c, highbd_ihalfright32_c }, // DST_ADST
+ { highbd_ihalfright32_c, highbd_ihalfcenter32_c }, // ADST_DST
+ { highbd_ihalfcenter32_c, highbd_ihalfright32_c }, // DST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfcenter32_c }, // FLIPADST_DST
+ { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c }, // DST_DST
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1 ; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
+ }
+
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = highbd_clip_pixel_add(dest[d],
+ ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
// idct
void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd) {
@@ -1409,15 +1613,27 @@
vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
break;
#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case DST_DST:
+ case DST_DCT:
+ case DCT_DST:
+ case DST_ADST:
+ case ADST_DST:
+ case FLIPADST_DST:
+ case DST_FLIPADST:
+ vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+ break;
case IDTX:
highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
break;
#endif // CONFIG_EXT_TX
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
- assert(0);
- break;
default:
assert(0);
break;
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index f257200..99c3340 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -276,28 +276,95 @@
static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
- int r, c, x, y, base, shift, val;
+ int r, c, x, base, shift, val;
(void)left;
(void)dy;
assert(dy == 1);
assert(dx < 0);
- for (r = 0; r < bs; ++r) {
- y = r + 1;
- for (c = 0; c < bs; ++c) {
- x = (c << 8) - y * dx;
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len;
+ DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][64]);
+ DECLARE_ALIGNED(16, uint8_t, src[64 + SUBPEL_TAPS]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ memset(src, above[0], pad_size * sizeof(above[0]));
+ memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
+ memset(src + pad_size + 2 * bs, above[2 * bs - 1],
+ pad_size * sizeof(above[0]));
+ flags[0] = 1;
+ x = -dx;
+ for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
base = x >> 8;
- shift = x - (base << 8);
+ shift = x & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = VPXMIN(bs, 2 * bs - 1 - base);
+ if (len <= 0) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = x >> 8;
+ shift = x & 0xFF;
+ for (c = 0; c < len; ++c) {
+ val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+ filter_type);
+ dst[c] = clip_pixel(val);
+ ++base;
+ }
+ } else {
+ if (!flags[shift]) {
+ vpx_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
+ vp10_intra_filter_kernels[filter_type][shift], 16,
+ NULL, 16, 2 * bs, 2 * bs < 16 ? 2 : 1);
+ flags[shift] = 1;
+ }
+ memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
+ len * sizeof(dst[0]));
+ }
+
+ if (len < bs)
+ memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
+ }
+ return;
+ }
+
+ // For linear filter, C code is faster.
+ x = -dx;
+ for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
+ base = x >> 8;
+ shift = x & 0xFF;
+
+ if (base >= 2 * bs - 1) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bs; ++c, ++base) {
if (base < 2 * bs - 1) {
- val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
- filter_type);
+ val = above[base] * (256 - shift) + above[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
dst[c] = clip_pixel(val);
} else {
dst[c] = above[2 * bs - 1];
}
}
- dst += stride;
}
}
@@ -305,33 +372,33 @@
static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
- int r, c, x, y, shift, val, base;
+ int r, c, x, y, shift1, shift2, val, base1, base2, use_above;
assert(dx > 0);
assert(dy > 0);
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- y = r + 1;
- x = (c << 8) - y * dx;
- base = x >> 8;
- if (base >= -1) {
- shift = x - (base << 8);
- val = intra_subpel_interp(base, shift, above, -1, bs - 1, filter_type);
+ x = -dx;
+ for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
+ use_above = 0;
+ base1 = x >> 8;
+ y = (r << 8) - dy;
+ for (c = 0; c < bs; ++c, ++base1, y -= dy) {
+ if (base1 >= -1) {
+ shift1 = x & 0xFF;
+ val = intra_subpel_interp(base1, shift1, above, -1, bs - 1,
+ filter_type);
} else {
- x = c + 1;
- y = (r << 8) - x * dy;
- base = y >> 8;
- if (base >= 0) {
- shift = y - (base << 8);
- val = intra_subpel_interp(base, shift, left, 0, bs - 1, filter_type);
+ base2 = y >> 8;
+ if (base2 >= 0) {
+ shift2 = y & 0xFF;
+ val = intra_subpel_interp(base2, shift2, left, 0, bs - 1,
+ filter_type);
} else {
val = left[0];
}
}
dst[c] = clip_pixel(val);
}
- dst += stride;
}
}
@@ -339,7 +406,7 @@
static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left,
int dx, int dy, INTRA_FILTER filter_type) {
- int r, c, x, y, base, shift, val;
+ int r, c, y, base, shift, val;
(void)above;
(void)dx;
@@ -347,21 +414,94 @@
assert(dx == 1);
assert(dy < 0);
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- x = c + 1;
- y = (r << 8) - x * dy;
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len, i;
+ DECLARE_ALIGNED(16, uint8_t, buf[64][4 * SUBPEL_SHIFTS]);
+ DECLARE_ALIGNED(16, uint8_t, src[(64 + SUBPEL_TAPS) * 4]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ for (i = 0; i < pad_size; ++i)
+ src[4 * i] = left[0];
+ for (i = 0; i < 2 * bs; ++i)
+ src[4 * (i + pad_size)] = left[i];
+ for (i = 0; i < pad_size; ++i)
+ src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
+ flags[0] = 1;
+ y = -dy;
+ for (c = 0; c < bs; ++c, y -= dy) {
base = y >> 8;
- shift = y - (base << 8);
- if (base < 2 * bs - 1) {
- val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
- filter_type);
- dst[c] = clip_pixel(val);
+ shift = y & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = VPXMIN(bs, 2 * bs - 1 - base);
+
+ if (len <= 0) {
+        for (i = 0; i < bs; ++i) {
+          dst[i * stride + c] = left[2 * bs - 1];
+ }
+ continue;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = y >> 8;
+ shift = y & 0xFF;
+ for (r = 0; r < len; ++r) {
+ val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+ filter_type);
+ dst[r * stride + c] = clip_pixel(val);
+ ++base;
+ }
} else {
- dst[c] = left[ 2 * bs - 1];
+ if (!flags[shift]) {
+ vpx_convolve8_vert(src + 4 * pad_size, 4,
+ buf[0] + 4 * shift, 4 * SUBPEL_SHIFTS, NULL, 16,
+ vp10_intra_filter_kernels[filter_type][shift], 16,
+                           4, 2 * bs);  // width: both ?: arms were 4
+ flags[shift] = 1;
+ }
+
+ if (shift == 0) {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = left[r + base];
+ }
+ } else {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = buf[r + base][4 * shift];
+ }
+ }
+ }
+
+ if (len < bs) {
+ for (r = len; r < bs; ++r) {
+ dst[r * stride + c] = left[ 2 * bs - 1];
+ }
}
}
- dst += stride;
+ return;
+ }
+
+ // For linear filter, C code is faster.
+ y = -dy;
+ for (c = 0; c < bs; ++c, y -= dy) {
+ base = y >> 8;
+ shift = y & 0xFF;
+
+ for (r = 0; r < bs; ++r, ++base) {
+ if (base < 2 * bs - 1) {
+ val = left[base] * (256 - shift) + left[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ dst[r * stride + c] = clip_pixel(val);
+ } else {
+ for (; r < bs; ++r)
+ dst[r * stride + c] = left[2 * bs - 1];
+ break;
+ }
+ }
}
}
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9860bae..c9f0295 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -404,6 +404,9 @@
add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht16x16 sse2/;
+ add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht32x32/;
+
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
} else {
@@ -416,6 +419,9 @@
add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht16x16 sse2 msa/;
+ add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht32x32/;
+
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
@@ -642,6 +648,9 @@
add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_highbd_fht16x16/;
+ add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_highbd_fht32x32/;
+
add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fwht4x4/;
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index cdb732a..333adbb 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -14,7 +14,6 @@
#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
-
#include "vp10/common/blockd.h"
#include "vp10/common/idct.h"
#include "vpx_dsp/fwd_txfm.h"
@@ -538,7 +537,7 @@
range_check(output, 16, 16);
}
-/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+#if CONFIG_EXT_TX
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
@@ -936,7 +935,7 @@
range_check(output, 32, 18);
}
-*/
+#endif // CONFIG_EXT_TX
static void fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
@@ -1213,6 +1212,37 @@
}
#if CONFIG_EXT_TX
+// For use in lieu of DST
+static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 8; ++i) {
+ output[16 + i] = input[i] * 4;
+ output[24 + i] = input[24 + i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 8] * Sqrt2);
+ }
+ fdct16(inputhalf, output);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 16; ++i) {
+ output[16 + i] = input[i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+ }
+ fdct16(inputhalf, output);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
static void copy_block(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
int i;
@@ -1375,6 +1405,27 @@
#endif // CONFIG_EXT_TX
};
+#if CONFIG_EXT_TX
+static const transform_2d FHT_32[] = {
+ { fdct32, fdct32 }, // DCT_DCT = 0,
+ { fhalfright32, fdct32 }, // ADST_DCT = 1,
+ { fdct32, fhalfright32 }, // DCT_ADST = 2,
+ { fhalfright32, fhalfright32 }, // ADST_ADST = 3,
+ { fhalfright32, fdct32 }, // FLIPADST_DCT = 4,
+ { fdct32, fhalfright32 }, // DCT_FLIPADST = 5,
+ { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST = 6,
+ { fhalfright32, fhalfright32 }, // ADST_FLIPADST = 7,
+ { fhalfright32, fhalfright32 }, // FLIPADST_ADST = 8,
+ { fhalfcenter32, fdct32 }, // DST_DCT = 9,
+ { fdct32, fhalfcenter32 }, // DCT_DST = 10,
+ { fhalfcenter32, fhalfright32 }, // DST_ADST = 11,
+ { fhalfright32, fhalfcenter32 }, // ADST_DST = 12,
+ { fhalfcenter32, fhalfright32 }, // DST_FLIPADST = 13,
+ { fhalfright32, fhalfcenter32 }, // FLIPADST_DST = 14,
+ { fhalfcenter32, fhalfcenter32 }, // DST_DST = 15
+};
+#endif // CONFIG_EXT_TX
+
void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -1671,3 +1722,46 @@
vp10_fht16x16_c(input, output, stride, tx_type);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vpx_fdct32x32_c(input, output, stride);
+ } else {
+ tran_low_t out[1024];
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ const transform_2d ht = FHT_32[tx_type];
+
+ int16_t flipped_input[32 * 32];
+ maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j + i * 32];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ output[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp10_fht32x32_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_EXT_TX
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 4f97c79..55ec9c1 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -50,7 +50,7 @@
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
@@ -2033,261 +2033,6 @@
#endif
}
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint64_t *sse,
- uint64_t *sum) {
- int i, j;
-
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
- &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
- const int dw = width % 16;
- const int dh = height % 16;
- int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
- int x, y;
-
- if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
- dw, height, &sse, &sum);
- total_sse += sse;
- }
-
- if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
- }
-
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
-
- pa += 16;
- pb += 16;
- }
-
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
-
- return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int width, int height,
- unsigned int input_shift) {
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t total_sse = 0;
- int x, y;
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- int64_t diff;
- diff = (a[x] >> input_shift) - (b[x] >> input_shift);
- total_sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
- return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
- int64_t total_sse = 0;
- int x, y;
- const int dw = width % 16;
- const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
- if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride,
- &b[width - dw], b_stride,
- dw, height, &sse, &sum);
- total_sse += sse;
- }
- if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
- }
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
- pa += 16;
- pb += 16;
- }
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
- return total_sse;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
- double psnr[4]; // total/y/u/v
- uint64_t sse[4]; // total/y/u/v
- uint32_t samples[4]; // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr,
- unsigned int bit_depth,
- unsigned int in_bit_depth) {
- const int widths[3] =
- {a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] =
- {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
- const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
- const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
- const double peak = (double)((1 << in_bit_depth) - 1);
- const unsigned int input_shift = bit_depth - in_bit_depth;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- uint64_t sse;
- if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (input_shift) {
- sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h,
- input_shift);
- } else {
- sse = highbd_get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h);
- }
- } else {
- sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
- }
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
-}
-
-#else // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr) {
- static const double peak = 255.0;
- const int widths[3] = {
- a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
- const int heights[3] = {
- a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
- const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
- const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- const uint64_t sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
static void generate_psnr_packet(VP10_COMP *cpi) {
struct vpx_codec_cx_pkt pkt;
@@ -2955,7 +2700,7 @@
vpx_clear_system_state();
- recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
@@ -3380,12 +3125,12 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- kf_err = vp10_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
} else {
- kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
#else
- kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
#endif // CONFIG_VP9_HIGHBITDEPTH
// Prevent possible divide by zero error below for perfect KF
@@ -3804,13 +3549,13 @@
if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- cpi->ambient_err = vp10_highbd_get_y_sse(cpi->Source,
+ cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
get_frame_new_buffer(cm));
} else {
- cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
#else
- cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
@@ -4557,28 +4302,6 @@
return 0;
}
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
-
- return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
- assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
- return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
int vp10_get_quantizer(VP10_COMP *cpi) {
return cpi->common.base_qindex;
}
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index cc20765..59c7682 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -633,12 +633,6 @@
return get_token_alloc(tile_mb_rows, tile_mb_cols);
}
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
void vp10_alloc_compressor_data(VP10_COMP *cpi);
void vp10_scale_references(VP10_COMP *cpi);
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 0f59259..6507f98 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -69,8 +69,6 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
- break;
case DST_DST:
case DCT_DST:
case DST_DCT:
@@ -78,8 +76,7 @@
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
- // Use C version since DST exists only in C
- vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+ vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 4);
diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c
index f116c00..cb2c1c7 100644
--- a/vp10/encoder/picklpf.c
+++ b/vp10/encoder/picklpf.c
@@ -13,6 +13,7 @@
#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/psnr.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -56,12 +57,12 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
} else {
- filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
}
#else
- filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Re-instate the unfiltered frame
diff --git a/vp10/encoder/pickrst.c b/vp10/encoder/pickrst.c
index 79cda43..9982836 100644
--- a/vp10/encoder/pickrst.c
+++ b/vp10/encoder/pickrst.c
@@ -14,6 +14,7 @@
#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/psnr.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -36,12 +37,12 @@
rsi, 1, partial_frame);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
} else {
- filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
}
#else
- filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Re-instate the unfiltered frame
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 8f6c4c7..2b15129 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -539,47 +539,70 @@
cost = token_costs[0][0][pt][EOB_TOKEN];
c = 0;
} else {
- int band_left = *band_count++;
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
- // dc token
- int v = qcoeff[0];
- int16_t prev_t;
- EXTRABIT e;
- vp10_get_token_extra(v, &prev_t, &e);
- cost = (*token_costs)[0][pt][prev_t] +
- vp10_get_cost(prev_t, e, cat6_high_cost);
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = vp10_get_token_cost(v, &prev_t, cat6_high_cost);
+ cost += (*token_costs)[0][pt][prev_t];
- token_cache[0] = vp10_pt_energy_class[prev_t];
- ++token_costs;
+ token_cache[0] = vp10_pt_energy_class[prev_t];
+ ++token_costs;
- // ac tokens
- for (c = 1; c < eob; c++) {
- const int rc = scan[c];
- int16_t t;
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
- v = qcoeff[rc];
- vp10_get_token_extra(v, &t, &e);
- if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] +
- vp10_get_cost(t, e, cat6_high_cost);
- } else {
- pt = get_coef_context(nb, token_cache, c);
- cost += (*token_costs)[!prev_t][pt][t] +
- vp10_get_cost(t, e, cat6_high_cost);
- token_cache[rc] = vp10_pt_energy_class[t];
+ v = qcoeff[rc];
+ cost += vp10_get_token_cost(v, &t, cat6_high_cost);
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+ prev_t = t;
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
}
- prev_t = t;
- if (!--band_left) {
- band_left = *band_count++;
- ++token_costs;
- }
- }
- // eob token
- if (band_left) {
- if (use_fast_coef_costing) {
+ // eob token
+ if (band_left)
cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
- } else {
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+ unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+ cost = vp10_get_token_cost(v, &tok, cat6_high_cost);
+ cost += (*token_costs)[0][pt][tok];
+
+ token_cache[0] = vp10_pt_energy_class[tok];
+ ++token_costs;
+
+ tok_cost_ptr = &((*token_costs)[!tok]);
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += vp10_get_token_cost(v, &tok, cat6_high_cost);
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*tok_cost_ptr)[pt][tok];
+ token_cache[rc] = vp10_pt_energy_class[tok];
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ tok_cost_ptr = &((*token_costs)[!tok]);
+ }
+
+ // eob token
+ if (band_left) {
pt = get_coef_context(nb, token_cache, c);
cost += (*token_costs)[0][pt][EOB_TOKEN];
}
@@ -658,6 +681,10 @@
plane_bsize, tx_size, &arg);
{
+#if CONFIG_VP9_HIGHBITDEPTH
+ const VP10_COMP *cpi = args->cpi;
+ const uint32_t hbd_shift = (cpi->common.bit_depth - 8) * 2;
+#endif
const int bs = 4 << tx_size;
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
@@ -674,8 +701,12 @@
const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
unsigned int tmp;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+ sse = (int64_t)ROUND_POWER_OF_TWO(
+ vpx_sum_squares_2d_i16(diff, diff_stride, bs), hbd_shift) * 16;
+#else
sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, bs) * 16;
+#endif
variance(src, src_stride, dst, dst_stride, &tmp);
dist = (int64_t)tmp * 16;
}
@@ -2332,6 +2363,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED(16, uint16_t, rec_buffer_alloc_16[32 * 32]);
uint8_t *rec_buffer;
+ const uint32_t hbd_shift = (cpi->common.bit_depth - 8) * 2;
#else
DECLARE_ALIGNED(16, uint8_t, rec_buffer[32 * 32]);
#endif
@@ -2372,11 +2404,21 @@
for (idy = 0; idy < blocks_height; idy += 2) {
for (idx = 0; idx < blocks_width; idx += 2) {
const int16_t *d = diff + 4 * idy * diff_stride + 4 * idx;
+#if CONFIG_VP9_HIGHBITDEPTH
+ tmp_sse += ROUND_POWER_OF_TWO(
+ vpx_sum_squares_2d_i16(d, diff_stride, 8), hbd_shift);
+#else
tmp_sse += vpx_sum_squares_2d_i16(d, diff_stride, 8);
+#endif
}
}
} else {
+#if CONFIG_VP9_HIGHBITDEPTH
+ tmp_sse = ROUND_POWER_OF_TWO(
+ vpx_sum_squares_2d_i16(diff, diff_stride, bh), hbd_shift);
+#else
tmp_sse = vpx_sum_squares_2d_i16(diff, diff_stride, bh);
+#endif
}
*bsse += (int64_t)tmp_sse * 16;
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 0aaeb2a..5cae8e3 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -50,6 +50,35 @@
const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
(sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
/ 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element vp10_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+ 3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+ 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+ 2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+ 3197, 3116, 3058, 2977, 2881, 2800,
+ 2742, 2661, 2615, 2534, 2476, 2395,
+ 2299, 2218, 2160, 2079,
+ 2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+ 1893, 1696, 1453, 1256, 1229, 864,
+ 512, 512, 512, 512, 0,
+ 512, 512, 512, 512,
+ 864, 1229, 1256, 1453, 1696, 1893,
+ 1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+ 2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+ 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+ 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+ 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+ 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp10_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+ / 2;
// Array indices are identical to previously-existing CONTEXT_NODE indices
const vpx_tree_index vp10_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
diff --git a/vp10/encoder/tokenize.h b/vp10/encoder/tokenize.h
index 12f5f1f..46b7f3f 100644
--- a/vp10/encoder/tokenize.h
+++ b/vp10/encoder/tokenize.h
@@ -76,6 +76,7 @@
*/
extern const TOKENVALUE *vp10_dct_value_tokens_ptr;
extern const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens;
+extern const int *vp10_dct_cat_lt_10_value_cost;
extern const int16_t vp10_cat6_low_cost[256];
extern const int vp10_cat6_high_cost[64];
extern const int vp10_cat6_high10_high_cost[256];
@@ -119,6 +120,18 @@
return vp10_dct_cat_lt_10_value_tokens[v].token;
}
+static INLINE int vp10_get_token_cost(int v, int16_t *token,
+ const int *cat6_high_table) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ EXTRABIT extrabits;
+ *token = CATEGORY6_TOKEN;
+ extrabits = abs(v) - CAT6_MIN_VAL;
+ return vp10_cat6_low_cost[extrabits & 0xff]
+ + cat6_high_table[extrabits >> 8];
+ }
+ *token = vp10_dct_cat_lt_10_value_tokens[v].token;
+ return vp10_dct_cat_lt_10_value_cost[v];
+}
#ifdef __cplusplus
} // extern "C"
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 976fe45..1cba803 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -172,6 +172,42 @@
transpose_4x4(in);
}
+#if CONFIG_EXT_TX
+static void fdst4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p24_p08);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p08_m24);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);
+}
+#endif // CONFIG_EXT_TX
+
void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[4];
@@ -229,6 +265,48 @@
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
+ case DST_DST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdst4_sse2(in);
+ fdst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_DST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DST_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DST_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case ADST_DST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fdst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fdst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_DST:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fdst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
#endif // CONFIG_EXT_TX
default:
assert(0);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index b1c2e11..edfd60c 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -21,7 +21,7 @@
#include "vp8/common/alloccommon.h"
#include "mcomp.h"
#include "firstpass.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/extend.h"
#include "ratectrl.h"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 69d5fa7..713b5f7 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -16,7 +16,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#if CONFIG_INTERNAL_STATS
@@ -2140,262 +2140,6 @@
#endif
}
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h, uint64_t *sse,
- uint64_t *sum) {
- int i, j;
-
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int w, int h,
- unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
- &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
- const int dw = width % 16;
- const int dh = height % 16;
- int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
- int x, y;
-
- if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
- dw, height, &sse, &sum);
- total_sse += sse;
- }
-
- if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
- }
-
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
-
- pa += 16;
- pb += 16;
- }
-
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
-
- return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- int width, int height,
- unsigned int input_shift) {
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t total_sse = 0;
- int x, y;
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- int64_t diff;
- diff = (a[x] >> input_shift) - (b[x] >> input_shift);
- total_sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
- return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int width, int height) {
- int64_t total_sse = 0;
- int x, y;
- const int dw = width % 16;
- const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
- if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride,
- &b[width - dw], b_stride,
- dw, height, &sse, &sum);
- total_sse += sse;
- }
- if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
- }
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
- pa += 16;
- pb += 16;
- }
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
- return total_sse;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
- double psnr[4]; // total/y/u/v
- uint64_t sse[4]; // total/y/u/v
- uint32_t samples[4]; // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr,
- unsigned int bit_depth,
- unsigned int in_bit_depth) {
- const int widths[3] =
- {a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] =
- {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
- const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
- const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
- const double peak = (double)((1 << in_bit_depth) - 1);
- const unsigned int input_shift = bit_depth - in_bit_depth;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- uint64_t sse;
- if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (input_shift) {
- sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h,
- input_shift);
- } else {
- sse = highbd_get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i], w, h);
- }
- } else {
- sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
- }
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
-}
-
-#else // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr) {
- static const double peak = 255.0;
- const int widths[3] = {
- a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
- const int heights[3] = {
- a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
- const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
- const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- const uint64_t sse = get_sse(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
- (double)total_sse);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
static void generate_psnr_packet(VP9_COMP *cpi) {
struct vpx_codec_cx_pkt pkt;
int i;
@@ -3061,7 +2805,7 @@
vpx_clear_system_state();
- recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
@@ -3571,12 +3315,12 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
} else {
- kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
#else
- kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
#endif // CONFIG_VP9_HIGHBITDEPTH
// Prevent possible divide by zero error below for perfect KF
@@ -3967,13 +3711,13 @@
if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+ cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
get_frame_new_buffer(cm));
} else {
- cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
#else
- cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
@@ -4844,28 +4588,6 @@
return;
}
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
-
- return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
- assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
- return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
int vp9_get_quantizer(VP9_COMP *cpi) {
return cpi->common.base_qindex;
}
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 8759cbe..017fa61 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -608,12 +608,6 @@
return get_token_alloc(tile_mb_rows, tile_mb_cols);
}
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
void vp9_scale_references(VP9_COMP *cpi);
void vp9_update_reference_frames(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index f6b1dfc..80ab238 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -12,7 +12,7 @@
#include <limits.h>
#include "./vpx_scale_rtcd.h"
-
+#include "vpx_dsp/psnr.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -52,12 +52,12 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
} else {
- filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
}
#else
- filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Re-instate the unfiltered frame
diff --git a/vpx/internal/vpx_psnr.h b/vpx/internal/vpx_psnr.h
deleted file mode 100644
index 0e90085..0000000
--- a/vpx/internal/vpx_psnr.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_INTERNAL_VPX_PSNR_H_
-#define VPX_INTERNAL_VPX_PSNR_H_
-
-#define MAX_PSNR 100.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
-
-/*!\brief Converts SSE to PSNR
- *
- * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
- *
- * \param[in] samples Number of samples
- * \param[in] peak Max sample value
- * \param[in] sse Sum of squared errors
- */
-double vpx_sse_to_psnr(double samples, double peak, double sse);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/vpx/src/vpx_psnr.c b/vpx/src/vpx_psnr.c
deleted file mode 100644
index 27a6180..0000000
--- a/vpx/src/vpx_psnr.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx/internal/vpx_psnr.h"
-
-
-double vpx_sse_to_psnr(double samples, double peak, double sse) {
- if (sse > 0.0) {
- const double psnr = 10.0 * log10(samples * peak * peak / sse);
- return psnr > MAX_PSNR ? MAX_PSNR : psnr;
- } else {
- return MAX_PSNR;
- }
-}
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index ccdef04..b77f458 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -36,10 +36,8 @@
API_SRCS-yes += src/vpx_encoder.c
API_SRCS-yes += vpx_encoder.h
API_SRCS-yes += internal/vpx_codec_internal.h
-API_SRCS-yes += internal/vpx_psnr.h
API_SRCS-yes += src/vpx_codec.c
API_SRCS-yes += src/vpx_image.c
-API_SRCS-yes += src/vpx_psnr.c
API_SRCS-yes += vpx_codec.h
API_SRCS-yes += vpx_codec.mk
API_SRCS-yes += vpx_frame_buffer.h
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index a0f59bf..402fd9a 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -2057,8 +2057,8 @@
}
}
-static void highbd_idct32_c(const tran_low_t *input,
- tran_low_t *output, int bd) {
+void vpx_highbd_idct32_c(const tran_low_t *input,
+ tran_low_t *output, int bd) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
(void) bd;
@@ -2447,7 +2447,7 @@
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
+ vpx_highbd_idct32_c(input, outptr, bd);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
@@ -2458,7 +2458,7 @@
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
+ vpx_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2477,7 +2477,7 @@
// Rows
// Only upper-left 8x8 has non-zero coeff.
for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
+ vpx_highbd_idct32_c(input, outptr, bd);
input += 32;
outptr += 32;
}
@@ -2485,7 +2485,7 @@
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
+ vpx_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 2358813..adbb838 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -100,6 +100,7 @@
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
new file mode 100644
index 0000000..1b92e2a
--- /dev/null
+++ b/vpx_dsp/psnr.c
@@ -0,0 +1,297 @@
+/*
+* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+* Use of this source code is governed by a BSD-style license
+* that can be found in the LICENSE file in the root of the source
+* tree. An additional intellectual property rights grant can be found
+* in the file PATENTS. All contributing project authors may
+* be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_scale/yv12config.h"
+
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+ } else {
+ return MAX_PSNR;
+ }
+}
+
+/* TODO(yaowu): get_sse() and highbd_get_sse() call the unoptimized C helpers
+* encoder_variance() and encoder_highbd_8_variance() for the edge blocks.
+* They should not; use the optimized variance functions instead. */
+static void encoder_variance(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, uint64_t *sse,
+ uint64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+ &sse_long, &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static int64_t get_sse(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int width, int height) {
+ const int dw = width % 16;
+ const int dh = height % 16;
+ int64_t total_sse = 0;
+ unsigned int sse = 0;
+ int sum = 0;
+ int x, y;
+
+ if (dw > 0) {
+ encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height, &sse, &sum);
+ total_sse += sse;
+ }
+
+ if (dh > 0) {
+ encoder_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
+ total_sse += sse;
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+
+ pa += 16;
+ pb += 16;
+ }
+
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+
+ return total_sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height,
+ unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+ unsigned int sse = 0;
+ int sum = 0;
+ if (dw > 0) {
+ encoder_highbd_8_variance(&a[width - dw], a_stride,
+ &b[width - dw], b_stride,
+ dw, height, &sse, &sum);
+ total_sse += sse;
+ }
+ if (dh > 0) {
+ encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
+ total_sse += sse;
+ }
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr,
+ unsigned int bit_depth,
+ unsigned int in_bit_depth) {
+ const int widths[3] =
+ { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] =
+ { a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i], w, h,
+ input_shift);
+ } else {
+ sse = highbd_get_sse(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i],
+ w, h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+ (double)total_sse);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr) {
+ static const double peak = 255.0;
+ const int widths[3] = {
+ a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = {
+ a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i],
+ w, h);
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+ (double)total_sse);
+}
diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h
new file mode 100644
index 0000000..c8da94f
--- /dev/null
+++ b/vpx_dsp/psnr.h
@@ -0,0 +1,65 @@
+/*
+* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+* Use of this source code is governed by a BSD-style license
+* that can be found in the LICENSE file in the root of the source
+* tree. An additional intellectual property rights grant can be found
+* in the file PATENTS. All contributing project authors may
+* be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_PSNR_H_
+#define VPX_DSP_PSNR_H_
+
+
+#include "vpx_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ double psnr[4]; // total/y/u/v
+ uint64_t sse[4]; // total/y/u/v
+ uint32_t samples[4]; // total/y/u/v
+} PSNR_STATS;
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+*
+* Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+*
+* \param[in] samples Number of samples
+* \param[in] peak Max sample value
+* \param[in] sse Sum of squared errors
+*/
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr,
+ unsigned int bit_depth,
+ unsigned int in_bit_depth);
+int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height,
+ unsigned int input_shift);
+#endif
+void calc_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int width, int height);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // VPX_DSP_PSNR_H_
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c
index 4d3d6ee..9b70c6a 100644
--- a/vpx_dsp/psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -19,7 +19,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ssim.h"
#include "vpx_ports/system_state.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
#if !defined(M_PI)
# define M_PI (3.141592653589793238462643)
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a5..9b0e990 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -57,10 +57,13 @@
static const tran_high_t cospi_30_64 = 1606;
static const tran_high_t cospi_31_64 = 804;
-// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
static const tran_high_t sinpi_1_9 = 5283;
static const tran_high_t sinpi_2_9 = 9929;
static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
#endif // VPX_DSP_TXFM_COMMON_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a44f948..dbb41aa 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -22,6 +22,8 @@
DSP_SRCS-yes += bitwriter.c
DSP_SRCS-yes += bitwriter_buffer.c
DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c