Merge "vp9_ethread: add parallel loopfilter"
diff --git a/configure b/configure
index 4509697..f5dd7fc 100755
--- a/configure
+++ b/configure
@@ -58,6 +58,7 @@
${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
+ ${toggle_vp9_highbitdepth} enable 10/12 bit support in VP9
${toggle_vp9_temporal_denoising}
enable vp9 temporal denoising
${toggle_webm_io} enable input from and output to WebM container
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index b9da8c9..b1ad837 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -50,9 +50,10 @@
return vget_lane_s32(c, 0);
}
+// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -215,25 +216,56 @@
unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight64, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 11); // >> 11 = / 32 * 64
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1);
+ variance_neon_w8(a + (kHeight32 * a_stride), a_stride,
+ b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight32, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 11); // >> 11 = / 64 * 32
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+ variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+ b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 64 * 32
}
unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight64, sse, &sum);
- return *sse - (((int64_t)sum * sum) >> 12); // >> 12 = / 64 * 64
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+ variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+ b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+ &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride,
+ b + (kHeight16 * 2 * b_stride), b_stride,
+ kWidth64, kHeight16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride,
+ b + (kHeight16 * 3 * b_stride), b_stride,
+ kWidth64, kHeight16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index ab85337..a7aaff0 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -352,6 +352,7 @@
int r;
const uint8_t *srcbuf = src.y_buffer;
uint8_t *destbuf = dest.y_buffer;
+
assert(dest.y_width == src.y_width);
assert(dest.y_height == src.y_height);
@@ -362,13 +363,13 @@
}
}
-static void swap_frame_buffer(YV12_BUFFER_CONFIG dest,
- YV12_BUFFER_CONFIG src) {
- uint8_t *tmp_buf = dest.y_buffer;
- assert(dest.y_width == src.y_width);
- assert(dest.y_height == src.y_height);
- dest.y_buffer = src.y_buffer;
- src.y_buffer = tmp_buf;
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+ YV12_BUFFER_CONFIG *src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
}
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -387,16 +388,16 @@
/* For non key frames */
if (refresh_alt_ref_frame) {
- swap_frame_buffer(denoiser->running_avg_y[ALTREF_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_golden_frame) {
- swap_frame_buffer(denoiser->running_avg_y[GOLDEN_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_last_frame) {
- swap_frame_buffer(denoiser->running_avg_y[LAST_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
}
}
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index ef6174e..82bce37 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -344,7 +344,8 @@
buf = vp9_lookahead_peek(cpi->lookahead, 0);
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
- lc->rc.frames_to_key != 0 && !(buf->flags & VPX_EFLAG_FORCE_KF)) {
+ lc->rc.frames_to_key != 0 &&
+ !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
(cpi->svc.number_spatial_layers > 1 &&