Merge "vpxenc: Warn on lagged encoding with real time."
diff --git a/ivfdec.c b/ivfdec.c
index a37a44c..c7f4a89 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -13,6 +13,25 @@
#include <stdio.h>
#include <stdlib.h>
+static void fix_framerate(int *num, int *den) {
+ // Some versions of vpxenc used 1/(2*fps) for the timebase, so
+ // we can guess the framerate using only the timebase in this
+ // case. Other files would require reading ahead to guess the
+ // timebase, like we do for webm.
+ if (*num < 1000) {
+ // Correct for the factor of 2 applied to the timebase in the encoder.
+ if (*num & 1)
+ *den *= 2;
+ else
+ *num /= 2;
+ } else {
+ // Don't know FPS for sure, and don't have readahead code
+ // (yet?), so just default to 30fps.
+ *num = 30;
+ *den = 1;
+ }
+}
+
int file_is_ivf(struct VpxInputContext *input_ctx) {
char raw_hdr[32];
int is_ivf = 0;
@@ -32,27 +51,8 @@
input_ctx->height = mem_get_le16(raw_hdr + 14);
input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
-
- /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
- * we can guess the framerate using only the timebase in this
- * case. Other files would require reading ahead to guess the
- * timebase, like we do for webm.
- */
- if (input_ctx->framerate.numerator < 1000) {
- /* Correct for the factor of 2 applied to the timebase in the
- * encoder.
- */
- if (input_ctx->framerate.numerator & 1)
- input_ctx->framerate.denominator <<= 1;
- else
- input_ctx->framerate.numerator >>= 1;
- } else {
- /* Don't know FPS for sure, and don't have readahead code
- * (yet?), so just default to 30fps.
- */
- input_ctx->framerate.numerator = 30;
- input_ctx->framerate.denominator = 1;
- }
+ fix_framerate(&input_ctx->framerate.numerator,
+ &input_ctx->framerate.denominator);
}
}
@@ -65,16 +65,10 @@
return is_ivf;
}
-int ivf_read_frame(struct VpxInputContext *input_ctx,
- uint8_t **buffer,
- size_t *bytes_read,
- size_t *buffer_size) {
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size) {
char raw_header[IVF_FRAME_HDR_SZ] = {0};
size_t frame_size = 0;
- FILE *infile = input_ctx->file;
-
- if (input_ctx->file_type != FILE_TYPE_IVF)
- return 0;
if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
if (!feof(infile))
diff --git a/ivfdec.h b/ivfdec.h
index 5da9acc..dd29cc6 100644
--- a/ivfdec.h
+++ b/ivfdec.h
@@ -18,10 +18,8 @@
int file_is_ivf(struct VpxInputContext *input);
-int ivf_read_frame(struct VpxInputContext *input,
- uint8_t **buffer,
- size_t *bytes_read,
- size_t *buffer_size);
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size);
#ifdef __cplusplus
} /* extern "C" */
diff --git a/ivfenc.c b/ivfenc.c
index fa92566..0041ff0 100644
--- a/ivfenc.c
+++ b/ivfenc.c
@@ -20,9 +20,6 @@
int frame_cnt) {
char header[32];
- if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
- return;
-
header[0] = 'D';
header[1] = 'K';
header[2] = 'I';
@@ -44,9 +41,6 @@
char header[12];
vpx_codec_pts_t pts;
- if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
- return;
-
pts = pkt->data.frame.pts;
mem_put_le32(header, (int)pkt->data.frame.sz);
mem_put_le32(header + 4, pts & 0xFFFFFFFF);
diff --git a/tools_common.h b/tools_common.h
index 7500523..1d70ab5 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -63,10 +63,8 @@
#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
-#define VP8_FOURCC (0x30385056)
-#define VP9_FOURCC (0x30395056)
-#define VP8_FOURCC_MASK (0x00385056)
-#define VP9_FOURCC_MASK (0x00395056)
+#define VP8_FOURCC 0x30385056
+#define VP9_FOURCC 0x30395056
enum VideoFileType {
FILE_TYPE_RAW,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index b457604..45d7984 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -43,8 +43,8 @@
typedef enum {
- USAGE_STREAM_FROM_SERVER = 0x0,
- USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+ USAGE_LOCAL_FILE_PLAYBACK = 0x0,
+ USAGE_STREAM_FROM_SERVER = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2,
USAGE_CONSTANT_QUALITY = 0x3,
} END_USAGE;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 212a28a..a172ba6 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -13,13 +13,16 @@
#include <stdio.h>
#include "./vpx_config.h"
-#include "vpx_scale/yv12config.h"
-#include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_textblit.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_textblit.h"
#define RGB_TO_YUV(t) \
( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
@@ -127,9 +130,6 @@
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
-
-/****************************************************************************
- */
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
@@ -371,7 +371,7 @@
}
}
-double vp9_gaussian(double sigma, double mu, double x) {
+static double gaussian(double sigma, double mu, double x) {
return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
(exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}
@@ -396,7 +396,7 @@
next = 0;
for (i = -32; i < 32; i++) {
- int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+ int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
if (a) {
for (j = 0; j < a; j++) {
@@ -425,27 +425,6 @@
state->last_noise = a;
}
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_c
- *
- * INPUTS : unsigned char *Start starting address of buffer to
- * add gaussian noise to
- * unsigned int width width of plane
- * unsigned int height height of plane
- * int pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
void vp9_plane_add_noise_c(uint8_t *start, char *noise,
char blackclamp[16],
char whiteclamp[16],
@@ -628,49 +607,40 @@
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
- int q = cm->lf.filter_level * 10 / 6;
- int flags = ppflags->post_proc_flag;
- int deblock_level = ppflags->deblocking_level;
- int noise_level = ppflags->noise_level;
+ const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+ const int flags = ppflags->post_proc_flag;
+ YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
+ struct postproc_state *const ppstate = &cm->postproc_state;
if (!cm->frame_to_show)
return -1;
- if (q > 63)
- q = 63;
-
if (!flags) {
*dest = *cm->frame_to_show;
return 0;
}
-#if ARCH_X86||ARCH_X86_64
- vpx_reset_mmx_state();
-#endif
+ vp9_clear_system_state();
if (flags & VP9D_DEMACROBLOCK) {
- deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
- q + (deblock_level - 5) * 10, 1, 0);
+ deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+ q + (ppflags->deblocking_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
+ vp9_deblock(cm->frame_to_show, ppbuf, q);
} else {
- vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
+ vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
if (flags & VP9D_ADDNOISE) {
- if (cm->postproc_state.last_q != q
- || cm->postproc_state.last_noise != noise_level) {
- fillrd(&cm->postproc_state, 63 - q, noise_level);
+ const int noise_level = ppflags->noise_level;
+ if (ppstate->last_q != q ||
+ ppstate->last_noise != noise_level) {
+ fillrd(ppstate, 63 - q, noise_level);
}
- vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
- cm->postproc_state.noise,
- cm->postproc_state.blackclamp,
- cm->postproc_state.whiteclamp,
- cm->postproc_state.bothclamp,
- cm->post_proc_buffer.y_width,
- cm->post_proc_buffer.y_height,
- cm->post_proc_buffer.y_stride);
+ vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+ ppstate->whiteclamp, ppstate->bothclamp,
+ ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
}
#if 0 && CONFIG_POSTPROC_VISUALIZER
@@ -684,16 +654,14 @@
cm->filter_level,
flags,
cm->mb_cols, cm->mb_rows);
- vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
- cm->post_proc_buffer.y_stride);
+ vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
}
if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
int i, j;
uint8_t *y_ptr;
- YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
+ int mb_rows = ppbuf->y_height >> 4;
+ int mb_cols = ppbuf->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = cm->mi;
@@ -719,9 +687,8 @@
if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
int i, j;
uint8_t *y_ptr;
- YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
+ int mb_rows = ppbuf->y_height >> 4;
+ int mb_cols = ppbuf->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = cm->mi;
@@ -755,17 +722,15 @@
snprintf(message, sizeof(message),
"Bitrate: %10.2f framerate: %10.2f ",
cm->bitrate, cm->framerate);
- vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
- cm->post_proc_buffer.y_stride);
+ vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
}
/* Draw motion vectors */
if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
- YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
- int y_stride = cm->post_proc_buffer.y_stride;
+ int width = ppbuf->y_width;
+ int height = ppbuf->y_height;
+ uint8_t *y_buffer = ppbuf->y_buffer;
+ int y_stride = ppbuf->y_stride;
MODE_INFO *mi = cm->mi;
int x0, y0;
@@ -904,13 +869,12 @@
if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
&& (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
int y, x;
- YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
- uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
- uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
- int y_stride = cm->post_proc_buffer.y_stride;
+ int width = ppbuf->y_width;
+ int height = ppbuf->y_height;
+ uint8_t *y_ptr = ppbuf->y_buffer;
+ uint8_t *u_ptr = ppbuf->u_buffer;
+ uint8_t *v_ptr = ppbuf->v_buffer;
+ int y_stride = ppbuf->y_stride;
MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
@@ -969,13 +933,12 @@
if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
ppflags->display_ref_frame_flag) {
int y, x;
- YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
- uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
- uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
- int y_stride = cm->post_proc_buffer.y_stride;
+ int width = ppbuf->y_width;
+ int height = ppbuf->y_height;
+ uint8_t *y_ptr = ppbuf->y_buffer;
+ uint8_t *u_ptr = ppbuf->u_buffer;
+ uint8_t *v_ptr = ppbuf->v_buffer;
+ int y_stride = ppbuf->y_stride;
MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
@@ -1002,7 +965,7 @@
}
#endif
- *dest = cm->post_proc_buffer;
+ *dest = *ppbuf;
/* handle problem with extending borders */
dest->y_width = cm->width;
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index c63beae..b8a456f 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -13,6 +13,7 @@
#define VP9_COMMON_VP9_POSTPROC_H_
#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_ppflags.h"
struct postproc_state {
int last_q;
@@ -23,8 +24,7 @@
DECLARE_ALIGNED(16, char, bothclamp[16]);
};
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_ppflags.h"
+struct VP9Common;
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index 6148206..7a790c5 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -39,17 +39,12 @@
typedef const vp9_tree_index vp9_tree[];
-/* Convert array of token occurrence counts into a table of probabilities
- for the associated binary encoding tree. Also writes count of branches
- taken for each node on the tree; this facilitiates decisions as to
- probability updates. */
-
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
// int64 is not needed for normal frame level calculations.
-// However when outputing entropy stats accumulated over many frames
+// However when outputting entropy stats accumulated over many frames
// or even clips we can overflow int math.
#ifdef ENTROPY_STATS
static INLINE vp9_prob get_prob(int num, int den) {
@@ -65,7 +60,7 @@
return get_prob(n0, n0 + n1);
}
-/* this function assumes prob1 and prob2 are already within [1,255] range */
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f954236..f4f7582 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,19 +23,20 @@
const short *filter
);
-#if HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#if (HAVE_SSSE3)
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -45,6 +46,90 @@
/* Ensure the filter can be compressed to int16_t. */
if (x_step_q4 == 16 && filter_x[3] != 128) {
while (w >= 16) {
+ vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+#else
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Ensure the filter can be compressed to int16_t. */
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
vp9_filter_block1d16_h8_ssse3(src, src_stride,
dst, dst_stride,
h, filter_x);
@@ -113,6 +198,7 @@
w, h);
}
}
+#endif
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..303fced
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, const unsigned char,
+filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, const unsigned char,
+filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
+
+
+
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+ // duplicate only the third 16 bit in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+ // duplicate only the seconds 16 bits in the filter into the second lane
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+ // duplicate only the forth 16 bits in the filter into the second lane
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // loading the local filters
+ thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+ forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // filter the source buffer
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // extract the higher half of the lane
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+ minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+ // add and saturate all the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save only 4 bytes
+ *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // filter the source buffer
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+ // add and saturate all the results together
+ minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+ srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // filter the source buffer
+ srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes.
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // filter the source buffer
+ srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+
+
+void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
+ __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+ // duplicate only the second 16 bits in the filter into the second lane
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+ // duplicate only the third 16 bits in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+ // duplicate only the forth 16 bits in the filter into the second lane
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ for (i = 0; i < output_height; i++) {
+ // load the first 4 byte
+ srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
+ // load the next 4 bytes in stride of src_pitch
+ srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
+
+ // merge the result together
+ srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+
+
+ srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
+ srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
+
+ // merge the result together
+ srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+
+ srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
+ srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
+
+ // merge the result together
+ srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
+
+ srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
+ srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
+
+ // merge the result together
+ srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
+ srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+
+ // extract the second lane of the 128 bit register
+ srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
+
+ // add and saturate the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_srli_si128(srcRegFilt3, 8));
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // save only 4 bytes convolve result
+ *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
+
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ for (i = 0; i < output_height; i++) {
+ // load the first 8 bytes
+ srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ // load the next 8 bytes in stride of src_pitch
+ srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+ srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+ srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+ // merge the result together
+ srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+ // load the next 8 bytes in stride of src_pitch
+ srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+ srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+ srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+ srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+ // merge the result together
+ srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+ // add and saturate the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
+
+
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+
+ for (i = 0; i < output_height; i++) {
+ // load the first 16 bytes
+ srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ // load the next 16 bytes in stride of src_pitch
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+ srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+ // merge the result together
+ srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+ // load the next 16 bytes in stride of two/three src_pitch
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+ // merge the result together
+ srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+ // load the next 16 bytes in stride of four/five src_pitch
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+ // merge the result together
+ srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+ srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 69c569d..ec4dc14 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -39,11 +39,7 @@
#endif
#ifdef ENTROPY_STATS
-int intra_mode_stats[INTRA_MODES]
- [INTRA_MODES]
- [INTRA_MODES];
vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
-
extern unsigned int active_section;
#endif
@@ -414,9 +410,6 @@
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
const int bm = m->bmi[i].as_mode;
-#ifdef ENTROPY_STATS
- ++intra_mode_stats[A][L][bm];
-#endif
write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]);
}
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4c66c20..b0fae65 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2543,6 +2543,7 @@
}
if (!is_inter_block(mbmi)) {
+ mbmi->skip_coeff = 1;
vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8));
vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8));
if (output_enabled)
@@ -2561,6 +2562,7 @@
if (!is_inter_block(mbmi)) {
vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else if (!x->skip) {
+ mbmi->skip_coeff = 1;
vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 21bc588..4bef675 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -451,6 +451,9 @@
ctx->tl[plane][j] = p->eobs[block] > 0;
}
+ if (p->eobs[block])
+ *(args->skip_coeff) = 0;
+
if (x->skip_encode || p->eobs[block] == 0)
return;
@@ -474,7 +477,6 @@
assert(0 && "Invalid transform size");
}
}
-
static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args *const args = arg;
@@ -499,7 +501,8 @@
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {x, &ctx};
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
vp9_subtract_sby(x, bsize);
if (x->optimize)
@@ -511,7 +514,8 @@
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {x, &ctx};
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
if (!x->skip_recode)
vp9_subtract_sb(x, bsize);
@@ -655,12 +659,15 @@
default:
assert(0);
}
+ if (*eob)
+ *(args->skip_coeff) = 0;
}
void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {x, &ctx};
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
&arg);
@@ -668,7 +675,8 @@
void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {x, &ctx};
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
}
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index cb872a7..207d573 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,7 @@
struct encode_b_args {
MACROBLOCK *x;
struct optimize_ctx *ctx;
+ unsigned char *skip_coeff;
};
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 9dfb442..538599d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -49,9 +49,6 @@
#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
-#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
-#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
-
static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
YV12_BUFFER_CONFIG temp = *a;
*a = *b;
@@ -269,20 +266,15 @@
// harder frames.
static double calculate_modified_err(VP9_COMP *cpi,
FIRSTPASS_STATS *this_frame) {
- const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+ struct twopass_rc *const twopass = &cpi->twopass;
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
const double av_err = stats->ssim_weighted_pred_err / stats->count;
- const double this_err = this_frame->ssim_weighted_pred_err;
- double modified_error;
+ double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ cpi->oxcf.two_pass_vbrbias / 100.0);
- modified_error = av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
- this_err > av_err ? POW1 : POW2);
-
- if (modified_error < cpi->twopass.modified_error_min)
- modified_error = cpi->twopass.modified_error_min;
- else if (modified_error > cpi->twopass.modified_error_max)
- modified_error = cpi->twopass.modified_error_max;
-
- return modified_error;
+ return fclamp(modified_error,
+ twopass->modified_error_min, twopass->modified_error_max);
}
static const double weight_table[256] = {
@@ -353,13 +345,14 @@
// This function returns the maximum target rate per frame.
static int frame_max_bits(VP9_COMP *cpi) {
int64_t max_bits =
- ((int64_t)cpi->rc.av_per_frame_bandwidth *
- (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+ ((int64_t)cpi->rc.av_per_frame_bandwidth *
+ (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
if (max_bits < 0)
- return 0;
- if (max_bits >= INT_MAX)
- return INT_MAX;
+ max_bits = 0;
+ else if (max_bits > cpi->rc.max_frame_bandwidth)
+ max_bits = cpi->rc.max_frame_bandwidth;
+
return (int)max_bits;
}
@@ -953,13 +946,13 @@
int q;
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
+ RATE_CONTROL *const rc = &cpi->rc;
- double section_err = fpstats->coded_error / fpstats->count;
- double err_per_mb = section_err / num_mbs;
- double err_correction_factor;
+ const double section_err = fpstats->coded_error / fpstats->count;
+ const double err_per_mb = section_err / num_mbs;
if (section_target_bandwitdh <= 0)
- return cpi->rc.worst_quality; // Highest value allowed
+ return rc->worst_quality; // Highest value allowed
target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
? (512 * section_target_bandwitdh) / num_mbs
@@ -967,15 +960,11 @@
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
- for (q = cpi->rc.best_quality; q < cpi->rc.worst_quality; q++) {
- int bits_per_mb_at_this_q;
-
- err_correction_factor = calc_correction_factor(err_per_mb,
- ERR_DIVISOR, 0.5, 0.90, q);
-
- bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
- err_correction_factor);
-
+ for (q = rc->best_quality; q < rc->worst_quality; q++) {
+ const double err_correction_factor = calc_correction_factor(err_per_mb,
+ ERR_DIVISOR, 0.5, 0.90, q);
+ const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+ err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
}
@@ -1169,8 +1158,7 @@
if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
break;
- zz_inter =
- (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+ zz_inter = (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
if (zz_inter < 0.999)
break;
}
@@ -1540,6 +1528,7 @@
int b_boost = 0;
int flash_detected;
int active_max_gf_interval;
+ RATE_CONTROL *const rc = &cpi->rc;
cpi->twopass.gf_group_bits = 0;
@@ -1556,7 +1545,7 @@
// If this is a key frame or the overlay from a previous arf then
// The error score / cost of this frame has already been accounted for.
- if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
+ if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
gf_group_err -= gf_first_frame_err;
// Motion breakout threshold for loop below depends on image size.
@@ -1570,14 +1559,14 @@
// interval to spread the cost of the GF.
//
active_max_gf_interval =
- 12 + ((int)vp9_convert_qindex_to_q(cpi->rc.last_q[INTER_FRAME]) >> 5);
+ 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
- if (active_max_gf_interval > cpi->rc.max_gf_interval)
- active_max_gf_interval = cpi->rc.max_gf_interval;
+ if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
i = 0;
while ((i < cpi->twopass.static_scene_max_gf_interval) &&
- (i < cpi->rc.frames_to_key)) {
+ (i < rc->frames_to_key)) {
i++; // Increment the loop counter
// Accumulate error score of frames in this gf group
@@ -1620,8 +1609,7 @@
}
// Calculate a boost number for this frame
- boost_score +=
- (decay_accumulator *
+ boost_score += (decay_accumulator *
calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
// Break out conditions.
@@ -1649,14 +1637,14 @@
cpi->twopass.gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
// Don't allow a gf too near the next kf
- if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
- while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
+ if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
+ while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
i++;
if (EOF == input_stats(&cpi->twopass, this_frame))
break;
- if (i < cpi->rc.frames_to_key) {
+ if (i < rc->frames_to_key) {
mod_frame_err = calculate_modified_err(cpi, this_frame);
gf_group_err += mod_frame_err;
}
@@ -1676,18 +1664,18 @@
#endif
// Set the interval until the next gf.
- if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
- cpi->rc.baseline_gf_interval = i - 1;
+ if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ rc->baseline_gf_interval = i - 1;
else
- cpi->rc.baseline_gf_interval = i;
+ rc->baseline_gf_interval = i;
// Should we use the alternate reference frame
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
(i >= MIN_GF_INTERVAL) &&
// for real scene cuts (not forced kfs) dont allow arf very near kf.
- (cpi->rc.next_key_frame_forced ||
- (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
+ (rc->next_key_frame_forced ||
+ (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) &&
((next_frame.pcnt_inter > 0.75) ||
(next_frame.pcnt_second_ref > 0.5)) &&
((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1695,25 +1683,25 @@
(boost_score > 100)) {
// Alternative boost calculation for alt ref
- cpi->rc.gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
- &b_boost);
- cpi->rc.source_alt_ref_pending = 1;
+ rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ &b_boost);
+ rc->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
// Set the ARF schedule.
if (cpi->multi_arf_enabled) {
- schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
+ schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
}
#endif
} else {
- cpi->rc.gfu_boost = (int)boost_score;
- cpi->rc.source_alt_ref_pending = 0;
+ rc->gfu_boost = (int)boost_score;
+ rc->source_alt_ref_pending = 0;
#if CONFIG_MULTIPLE_ARF
// Set the GF schedule.
if (cpi->multi_arf_enabled) {
- schedule_frames(cpi, 0, cpi->rc.baseline_gf_interval - 1, 2, 0, 0);
+ schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
assert(cpi->new_frame_coding_order_period ==
- cpi->rc.baseline_gf_interval);
+ rc->baseline_gf_interval);
}
#endif
}
@@ -1773,32 +1761,28 @@
// Clip cpi->twopass.gf_group_bits based on user supplied data rate
// variability limit (cpi->oxcf.two_pass_vbrmax_section)
- if (cpi->twopass.gf_group_bits >
- (int64_t)max_bits * cpi->rc.baseline_gf_interval)
- cpi->twopass.gf_group_bits =
- (int64_t)max_bits * cpi->rc.baseline_gf_interval;
+ if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+ cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
// Reset the file position
reset_fpf_position(&cpi->twopass, start_pos);
// Assign bits to the arf or gf.
- for (i = 0;
- i <= (cpi->rc.source_alt_ref_pending &&
- cpi->common.frame_type != KEY_FRAME);
- ++i) {
+ for (i = 0; i <= (rc->source_alt_ref_pending &&
+ cpi->common.frame_type != KEY_FRAME); ++i) {
int allocation_chunks;
- int q = cpi->rc.last_q[INTER_FRAME];
+ int q = rc->last_q[INTER_FRAME];
int gf_bits;
- int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
+ int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
// Set max and minimum boost and hence minimum allocation
- boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
+ boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
- if (cpi->rc.source_alt_ref_pending && i == 0)
- allocation_chunks = ((cpi->rc.baseline_gf_interval + 1) * 100) + boost;
+ if (rc->source_alt_ref_pending && i == 0)
+ allocation_chunks = ((rc->baseline_gf_interval + 1) * 100) + boost;
else
- allocation_chunks = (cpi->rc.baseline_gf_interval * 100) + (boost - 100);
+ allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
// Prevent overflow
if (boost > 1023) {
@@ -1815,11 +1799,10 @@
// If the frame that is to be boosted is simpler than the average for
// the gf/arf group then use an alternative calculation
// based on the error score of the frame itself
- if (cpi->rc.baseline_gf_interval < 1 ||
- mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
- double alt_gf_grp_bits =
- (double)cpi->twopass.kf_group_bits *
- (mod_frame_err * (double)cpi->rc.baseline_gf_interval) /
+ if (rc->baseline_gf_interval < 1 ||
+ mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
+ double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits *
+ (mod_frame_err * (double)rc->baseline_gf_interval) /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
@@ -1846,10 +1829,11 @@
if (i == 0) {
cpi->twopass.gf_bits = gf_bits;
}
- if (i == 1 || (!cpi->rc.source_alt_ref_pending
- && (cpi->common.frame_type != KEY_FRAME))) {
+ if (i == 1 ||
+ (!rc->source_alt_ref_pending &&
+ (cpi->common.frame_type != KEY_FRAME))) {
// Per frame bit target for this frame
- cpi->rc.per_frame_bandwidth = gf_bits;
+ rc->per_frame_bandwidth = gf_bits;
}
}
@@ -1867,7 +1851,7 @@
// the remaining bits amoung the other frames/
// For normal GFs remove the score for the GF itself unless this is
// also a key frame in which case it has already been accounted for.
- if (cpi->rc.source_alt_ref_pending) {
+ if (rc->source_alt_ref_pending) {
cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
} else if (cpi->common.frame_type != KEY_FRAME) {
cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err
@@ -1884,9 +1868,8 @@
// This condition could fail if there are two kfs very close together
// despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
// calculation of alt_extra_bits.
- if (cpi->rc.baseline_gf_interval >= 3) {
- const int boost = cpi->rc.source_alt_ref_pending ?
- b_boost : cpi->rc.gfu_boost;
+ if (rc->baseline_gf_interval >= 3) {
+ const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
if (boost >= 150) {
int alt_extra_bits;
@@ -1905,7 +1888,7 @@
zero_stats(§ionstats);
reset_fpf_position(&cpi->twopass, start_pos);
- for (i = 0; i < cpi->rc.baseline_gf_interval; i++) {
+ for (i = 0; i < rc->baseline_gf_interval; i++) {
input_stats(&cpi->twopass, &next_frame);
accumulate_stats(§ionstats, &next_frame);
}
@@ -2028,6 +2011,7 @@
FIRSTPASS_STATS this_frame;
FIRSTPASS_STATS this_frame_copy;
+ RATE_CONTROL *rc = &cpi->rc;
double this_frame_intra_error;
double this_frame_coded_error;
@@ -2042,7 +2026,7 @@
vp9_clear_system_state();
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- cpi->rc.active_worst_quality = cpi->oxcf.cq_level;
+ rc->active_worst_quality = cpi->oxcf.cq_level;
} else if (cpi->common.current_video_frame == 0) {
// Special case code for first frame.
int section_target_bandwidth =
@@ -2051,9 +2035,9 @@
tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
section_target_bandwidth);
- cpi->rc.active_worst_quality = tmp_q;
- cpi->rc.ni_av_qi = tmp_q;
- cpi->rc.avg_q = vp9_convert_qindex_to_q(tmp_q);
+ rc->active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
@@ -2070,7 +2054,7 @@
this_frame_coded_error = this_frame.coded_error;
// keyframe and section processing !
- if (cpi->rc.frames_to_key == 0) {
+ if (rc->frames_to_key == 0) {
// Define next KF group and assign bits to it
this_frame_copy = this_frame;
find_next_key_frame(cpi, &this_frame_copy);
@@ -2079,7 +2063,7 @@
}
// Is this a GF / ARF (Note that a KF is always also a GF)
- if (cpi->rc.frames_till_gf_update_due == 0) {
+ if (rc->frames_till_gf_update_due == 0) {
// Define next gf group and assign bits to it
this_frame_copy = this_frame;
@@ -2102,7 +2086,7 @@
cpi->enable_encode_breakout = 2;
}
- cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
cpi->refresh_golden_frame = 1;
} else {
// Otherwise this is an ordinary frame
@@ -2123,8 +2107,8 @@
}
// Set nominal per second bandwidth for this frame
- cpi->target_bandwidth = (int)(cpi->rc.per_frame_bandwidth
- * cpi->output_framerate);
+ cpi->target_bandwidth = (int)(rc->per_frame_bandwidth *
+ cpi->output_framerate);
if (cpi->target_bandwidth < 0)
cpi->target_bandwidth = 0;
@@ -2179,10 +2163,9 @@
// Cumulative effect of decay in prediction quality
if (local_next_frame.pcnt_inter > 0.85)
- decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+ decay_accumulator *= local_next_frame.pcnt_inter;
else
- decay_accumulator =
- decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+ decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
// decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
@@ -2241,6 +2224,8 @@
double kf_group_coded_err = 0.0;
double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+ RATE_CONTROL *const rc = &cpi->rc;
+
vp9_zero(next_frame);
vp9_clear_system_state(); // __asm emms;
@@ -2249,15 +2234,15 @@
cpi->common.frame_type = KEY_FRAME;
// is this a forced key frame by interval
- cpi->rc.this_key_frame_forced = cpi->rc.next_key_frame_forced;
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
// Clear the alt ref active flag as this can never be active on a key frame
- cpi->rc.source_alt_ref_active = 0;
+ rc->source_alt_ref_active = 0;
// Kf is always a gf so clear frames till next gf counter
- cpi->rc.frames_till_gf_update_due = 0;
+ rc->frames_till_gf_update_due = 0;
- cpi->rc.frames_to_key = 1;
+ rc->frames_to_key = 1;
// Take a copy of the initial frame details
first_frame = *this_frame;
@@ -2309,14 +2294,14 @@
break;
// Step on to the next frame
- cpi->rc.frames_to_key++;
+ rc->frames_to_key++;
// If we don't have a real key frame within the next two
// forcekeyframeevery intervals then break out of the loop.
- if (cpi->rc.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+ if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency)
break;
} else {
- cpi->rc.frames_to_key++;
+ rc->frames_to_key++;
}
i++;
}
@@ -2325,11 +2310,11 @@
// We already breakout of the loop above at 2x max.
// This code centers the extra kf if the actual natural
// interval is between 1x and 2x
- if (cpi->oxcf.auto_key
- && cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
+ if (cpi->oxcf.auto_key &&
+ rc->frames_to_key > (int)cpi->key_frame_frequency) {
FIRSTPASS_STATS tmp_frame;
- cpi->rc.frames_to_key /= 2;
+ rc->frames_to_key /= 2;
// Copy first frame details
tmp_frame = first_frame;
@@ -2342,7 +2327,7 @@
kf_group_coded_err = 0;
// Rescan to get the correct error data for the forced kf group
- for (i = 0; i < cpi->rc.frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; i++) {
// Accumulate kf group errors
kf_group_err += calculate_modified_err(cpi, &tmp_frame);
kf_group_intra_err += tmp_frame.intra_error;
@@ -2351,11 +2336,11 @@
// Load a the next frame's stats
input_stats(&cpi->twopass, &tmp_frame);
}
- cpi->rc.next_key_frame_forced = 1;
+ rc->next_key_frame_forced = 1;
} else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
- cpi->rc.next_key_frame_forced = 1;
+ rc->next_key_frame_forced = 1;
} else {
- cpi->rc.next_key_frame_forced = 0;
+ rc->next_key_frame_forced = 0;
}
// Special case for the last key frame of the file
@@ -2386,7 +2371,7 @@
cpi->twopass.modified_error_left));
// Clip based on maximum per frame rate defined by the user.
- max_grp_bits = (int64_t)max_bits * (int64_t)cpi->rc.frames_to_key;
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
if (cpi->twopass.kf_group_bits > max_grp_bits)
cpi->twopass.kf_group_bits = max_grp_bits;
} else {
@@ -2402,7 +2387,7 @@
loop_decay_rate = 1.00; // Starting decay rate
// Scan through the kf group collating various stats.
- for (i = 0; i < cpi->rc.frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; i++) {
double r;
if (EOF == input_stats(&cpi->twopass, &next_frame))
@@ -2416,7 +2401,7 @@
}
// For the first few frames collect data to decide kf boost.
- if (i <= (cpi->rc.max_gf_interval * 2)) {
+ if (i <= (rc->max_gf_interval * 2)) {
if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
r = (IIKFACTOR2 * next_frame.intra_error /
DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
@@ -2445,16 +2430,15 @@
zero_stats(§ionstats);
reset_fpf_position(&cpi->twopass, start_position);
- for (i = 0; i < cpi->rc.frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; i++) {
input_stats(&cpi->twopass, &next_frame);
accumulate_stats(§ionstats, &next_frame);
}
avg_stats(§ionstats);
- cpi->twopass.section_intra_rating = (int)
- (sectionstats.intra_error
- / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+ cpi->twopass.section_intra_rating = (int) (sectionstats.intra_error /
+ DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
}
// Reset the first pass file position
@@ -2466,15 +2450,15 @@
int allocation_chunks;
int alt_kf_bits;
- if (kf_boost < (cpi->rc.frames_to_key * 3))
- kf_boost = (cpi->rc.frames_to_key * 3);
+ if (kf_boost < (rc->frames_to_key * 3))
+ kf_boost = (rc->frames_to_key * 3);
if (kf_boost < 300) // Min KF boost
kf_boost = 300;
// Make a note of baseline boost and the zero motion
// accumulator value for use elsewhere.
- cpi->rc.kf_boost = kf_boost;
+ rc->kf_boost = kf_boost;
cpi->twopass.kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
// We do three calculations for kf size.
@@ -2491,10 +2475,10 @@
// care of by kf_boost.
if (zero_motion_accumulator >= 0.99) {
allocation_chunks =
- ((cpi->rc.frames_to_key - 1) * 10) + kf_boost;
+ ((rc->frames_to_key - 1) * 10) + kf_boost;
} else {
allocation_chunks =
- ((cpi->rc.frames_to_key - 1) * 100) + kf_boost;
+ ((rc->frames_to_key - 1) * 100) + kf_boost;
}
// Prevent overflow
@@ -2504,22 +2488,21 @@
allocation_chunks /= divisor;
}
- cpi->twopass.kf_group_bits =
- (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+ cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0
+ : cpi->twopass.kf_group_bits;
// Calculate the number of bits to be spent on the key frame
- cpi->twopass.kf_bits =
- (int)((double)kf_boost *
+ cpi->twopass.kf_bits = (int)((double)kf_boost *
((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
// If the key frame is actually easier than the average for the
// kf group (which does sometimes happen... eg a blank intro frame)
// Then use an alternate calculation based on the kf error score
// which should give a smaller key frame.
- if (kf_mod_err < kf_group_err / cpi->rc.frames_to_key) {
+ if (kf_mod_err < kf_group_err / rc->frames_to_key) {
double alt_kf_grp_bits =
((double)cpi->twopass.bits_left *
- (kf_mod_err * (double)cpi->rc.frames_to_key) /
+ (kf_mod_err * (double)rc->frames_to_key) /
DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
alt_kf_bits = (int)((double)kf_boost *
@@ -2532,8 +2515,7 @@
// Else if it is much harder than other frames in the group make sure
// it at least receives an allocation in keeping with its relative
// error score
- alt_kf_bits =
- (int)((double)cpi->twopass.bits_left *
+ alt_kf_bits = (int)((double)cpi->twopass.bits_left *
(kf_mod_err /
DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
@@ -2545,7 +2527,7 @@
cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
// Peer frame bit target for this frame
- cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
+ rc->per_frame_bandwidth = cpi->twopass.kf_bits;
// Convert to a per second bitrate
cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
cpi->output_framerate);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index abdcf2f..2c7c86e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -59,6 +59,11 @@
#define DISABLE_COMPOUND_SPLIT 0x18
#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
#if CONFIG_INTERNAL_STATS
extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest, int lumamask,
@@ -88,12 +93,6 @@
#endif
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[INTRA_MODES]
- [INTRA_MODES]
- [INTRA_MODES];
-#endif
-
#ifdef MODE_STATS
extern void init_tx_count_stats();
extern void write_tx_count_stats();
@@ -1093,6 +1092,9 @@
};
void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+ VP9_COMMON *const cm = &cpi->common;
+ int64_t vbr_max_bits;
+
if (framerate < 0.1)
framerate = 30;
@@ -1109,6 +1111,19 @@
cpi->rc.min_frame_bandwidth = MAX(cpi->rc.min_frame_bandwidth,
FRAME_OVERHEAD_BITS);
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However this limit is extended if
+ // a very high rate is given on the command line or the the rate cannnot
+ // be acheived because of a user specificed max q (e.g. when the user
+ // specifies lossless encode.
+ //
+ vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
+ (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+ cpi->rc.max_frame_bandwidth =
+ MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
// Set Maximum gf/arf interval
cpi->rc.max_gf_interval = 16;
@@ -1957,41 +1972,6 @@
}
#endif
-#ifdef ENTROPY_STATS
- {
- int i, j, k;
- FILE *fmode = fopen("vp9_modecontext.c", "w");
-
- fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
- fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
- fprintf(fmode, "[INTRA_MODES][INTRA_MODES]"
- "[INTRA_MODES] =\n{\n");
-
- for (i = 0; i < INTRA_MODES; i++) {
- fprintf(fmode, " { // Above Mode : %d\n", i);
-
- for (j = 0; j < INTRA_MODES; j++) {
- fprintf(fmode, " {");
-
- for (k = 0; k < INTRA_MODES; k++) {
- if (!intra_mode_stats[i][j][k])
- fprintf(fmode, " %5d, ", 1);
- else
- fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
- }
-
- fprintf(fmode, "}, // left_mode %d\n", j);
- }
-
- fprintf(fmode, " },\n");
- }
-
- fprintf(fmode, "};\n");
- fclose(fmode);
- }
-#endif
-
-
#if defined(SECTIONBITS_OUTPUT)
if (0) {
@@ -2443,10 +2423,14 @@
int force_recode = 0;
VP9_COMMON *cm = &cpi->common;
- // Is frame recode allowed at all
- // Yes if either recode mode 1 is selected or mode two is selected
- // and the frame is a key frame. golden frame or alt_ref_frame
- if ((cpi->sf.recode_loop == 1) ||
+ // Special case trap if maximum allowed frame size exceeded.
+ if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) {
+ force_recode = 1;
+
+ // Is frame recode allowed.
+ // Yes if either recode mode 1 is selected or mode 2 is selected
+ // and the frame is a key frame, golden frame or alt_ref_frame
+ } else if ((cpi->sf.recode_loop == 1) ||
((cpi->sf.recode_loop == 2) &&
((cm->frame_type == KEY_FRAME) ||
cpi->refresh_golden_frame ||
@@ -2624,7 +2608,8 @@
"%6d %6d %5d %5d %5d %10d %10.3f"
"%10.3f %8d %10d %10d %10d\n",
cpi->common.current_video_frame, cpi->rc.this_frame_target,
- cpi->rc.projected_frame_size, 0,
+ cpi->rc.projected_frame_size,
+ cpi->rc.projected_frame_size / cpi->common.MBs,
(cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
(int)cpi->rc.total_target_vs_actual,
(int)(cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
@@ -2734,8 +2719,9 @@
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
loop = 0;
} else {
- // Special case handling for forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
+ if ((cm->frame_type == KEY_FRAME) &&
+ cpi->rc.this_key_frame_forced &&
+ (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) {
int last_q = *q;
int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
@@ -2774,7 +2760,7 @@
loop = *q != last_q;
} else if (recode_loop_test(
cpi, frame_over_shoot_limit, frame_under_shoot_limit,
- *q, top_index, bottom_index)) {
+ *q, MAX(q_high, top_index), bottom_index)) {
// Is the projected frame size out of range and are we allowed
// to attempt to recode.
int last_q = *q;
@@ -2785,6 +2771,10 @@
// Frame is too large
if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth)
+ q_high = cpi->rc.worst_quality;
+
// Raise Qlow as to at least the current value
q_low = *q < q_high ? *q + 1 : q_high;
@@ -2798,12 +2788,12 @@
vp9_rc_update_rate_correction_factors(cpi, 0);
*q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
- bottom_index, top_index);
+ bottom_index, MAX(q_high, top_index));
while (*q < q_low && retries < 10) {
vp9_rc_update_rate_correction_factors(cpi, 0);
*q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
- bottom_index, top_index);
+ bottom_index, MAX(q_high, top_index));
retries++;
}
}
@@ -2849,7 +2839,9 @@
}
}
- if (cpi->rc.is_src_frame_alt_ref)
+ // Special case for overlay frame.
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth))
loop = 0;
if (loop) {
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index b1969f3..a665bf8 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -442,9 +442,10 @@
unsigned int source_alt_ref_active;
unsigned int is_src_frame_alt_ref;
- int per_frame_bandwidth; // Current section per frame bandwidth target
- int av_per_frame_bandwidth; // Average frame size target for clip
- int min_frame_bandwidth; // Minimum allocation used for any frame
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
int ni_av_qi;
int ni_tot_qi;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 58078ad..7a5282d 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -32,44 +32,6 @@
stride * (lines_to_copy + 16));
}
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest, int Fraction) {
- int i, j;
- int Total = 0;
- int srcoffset, dstoffset;
- uint8_t *src = source->y_buffer;
- uint8_t *dst = dest->y_buffer;
-
- int linestocopy = (source->y_height >> (Fraction + 4));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
-
- srcoffset = source->y_stride * (dest->y_height >> 5) * 16;
- dstoffset = dest->y_stride * (dest->y_height >> 5) * 16;
-
- src += srcoffset;
- dst += dstoffset;
-
- // Loop through the raw Y plane and reconstruction data summing the square
- // differences.
- for (i = 0; i < linestocopy; i += 16) {
- for (j = 0; j < source->y_width; j += 16) {
- unsigned int sse;
- Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
- &sse);
- }
-
- src += 16 * source->y_stride;
- dst += 16 * dest->y_stride;
- }
-
- return Total;
-}
-
// Enforce a minimum filter level based upon baseline Q
static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
int min_filter_level;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index aefef53..72ab00f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -258,25 +258,27 @@
// Update the buffer level: leaky bucket model.
void vp9_update_buffer_level(VP9_COMP *const cpi, int encoded_frame_size) {
VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
// Non-viewable frames are a special case and are treated as pure overhead.
if (!cm->show_frame) {
- cpi->rc.bits_off_target -= encoded_frame_size;
+ rc->bits_off_target -= encoded_frame_size;
} else {
- cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
- encoded_frame_size;
+ rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size;
}
// Clip the buffer level to the maximum specified buffer size.
- if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) {
- cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+ if (rc->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+ rc->bits_off_target = cpi->oxcf.maximum_buffer_size;
}
- cpi->rc.buffer_level = cpi->rc.bits_off_target;
+ rc->buffer_level = rc->bits_off_target;
}
int vp9_drop_frame(VP9_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
if (!cpi->oxcf.drop_frames_water_mark) {
return 0;
} else {
- if (cpi->rc.buffer_level < 0) {
+ if (rc->buffer_level < 0) {
// Always drop if buffer is below 0.
return 1;
} else {
@@ -284,23 +286,23 @@
// (starting with the next frame) until it increases back over drop_mark.
int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
cpi->oxcf.optimal_buffer_level / 100);
- if ((cpi->rc.buffer_level > drop_mark) &&
- (cpi->rc.decimation_factor > 0)) {
- --cpi->rc.decimation_factor;
- } else if (cpi->rc.buffer_level <= drop_mark &&
- cpi->rc.decimation_factor == 0) {
- cpi->rc.decimation_factor = 1;
+ if ((rc->buffer_level > drop_mark) &&
+ (rc->decimation_factor > 0)) {
+ --rc->decimation_factor;
+ } else if (rc->buffer_level <= drop_mark &&
+ rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
}
- if (cpi->rc.decimation_factor > 0) {
- if (cpi->rc.decimation_count > 0) {
- --cpi->rc.decimation_count;
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
return 1;
} else {
- cpi->rc.decimation_count = cpi->rc.decimation_factor;
+ rc->decimation_count = rc->decimation_factor;
return 0;
}
} else {
- cpi->rc.decimation_count = 0;
+ rc->decimation_count = 0;
return 0;
}
}
@@ -314,63 +316,65 @@
// If buffer is below the optimal level, let the active_worst_quality go from
// ambient Q (at buffer = optimal level) to worst_quality level
// (at buffer = critical level).
- int active_worst_quality = cpi->rc.active_worst_quality;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_CONFIG *const oxcf = &cpi->oxcf;
+ int active_worst_quality = rc->active_worst_quality;
// Maximum limit for down adjustment, ~20%.
int max_adjustment_down = active_worst_quality / 5;
// Buffer level below which we push active_worst to worst_quality.
- int critical_level = cpi->oxcf.optimal_buffer_level >> 2;
+ int critical_level = oxcf->optimal_buffer_level >> 2;
int adjustment = 0;
int buff_lvl_step = 0;
- if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
+ if (rc->buffer_level > oxcf->optimal_buffer_level) {
// Adjust down.
if (max_adjustment_down) {
- buff_lvl_step = (int)((cpi->oxcf.maximum_buffer_size -
- cpi->oxcf.optimal_buffer_level) / max_adjustment_down);
- if (buff_lvl_step) {
- adjustment = (int)((cpi->rc.buffer_level -
- cpi->oxcf.optimal_buffer_level) / buff_lvl_step);
- }
+ buff_lvl_step = (int)((oxcf->maximum_buffer_size -
+ oxcf->optimal_buffer_level) / max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+ buff_lvl_step);
active_worst_quality -= adjustment;
}
- } else if (cpi->rc.buffer_level > critical_level) {
+ } else if (rc->buffer_level > critical_level) {
// Adjust up from ambient Q.
if (critical_level) {
- buff_lvl_step = (cpi->oxcf.optimal_buffer_level - critical_level);
+ buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
if (buff_lvl_step) {
- adjustment =
- (cpi->rc.worst_quality - cpi->rc.avg_frame_qindex[INTER_FRAME]) *
- (cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level) /
- buff_lvl_step;
+ adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+ (oxcf->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step;
}
- active_worst_quality = cpi->rc.avg_frame_qindex[INTER_FRAME] + adjustment;
+ active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
}
} else {
// Set to worst_quality if buffer is below critical level.
- active_worst_quality = cpi->rc.worst_quality;
+ active_worst_quality = rc->worst_quality;
}
return active_worst_quality;
}
// Adjust target frame size with respect to the buffering constraints:
static int target_size_from_buffer_level(const VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_CONFIG *const oxcf = &cpi->oxcf;
int this_frame_target = cpi->rc.this_frame_target;
int percent_low = 0;
int percent_high = 0;
- int one_percent_bits = (int)(1 + cpi->oxcf.optimal_buffer_level / 100);
- if (cpi->rc.buffer_level < cpi->oxcf.optimal_buffer_level) {
- percent_low = (int)((cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level)
- / one_percent_bits);
- if (percent_low > cpi->oxcf.under_shoot_pct) {
- percent_low = cpi->oxcf.under_shoot_pct;
- }
+ int one_percent_bits = (int)(1 + oxcf->optimal_buffer_level / 100);
+ if (rc->buffer_level < oxcf->optimal_buffer_level) {
+ percent_low = (int)((oxcf->optimal_buffer_level - rc->buffer_level) /
+ one_percent_bits);
+ if (percent_low > oxcf->under_shoot_pct)
+ percent_low = oxcf->under_shoot_pct;
+
// Lower the target bandwidth for this frame.
this_frame_target -= (this_frame_target * percent_low) / 200;
- } else if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
- percent_high = (int)((cpi->rc.buffer_level - cpi->oxcf.optimal_buffer_level)
- / one_percent_bits);
- if (percent_high > cpi->oxcf.over_shoot_pct) {
- percent_high = cpi->oxcf.over_shoot_pct;
- }
+ } else if (rc->buffer_level > oxcf->optimal_buffer_level) {
+ percent_high = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+ one_percent_bits);
+ if (percent_high > oxcf->over_shoot_pct)
+ percent_high = oxcf->over_shoot_pct;
+
// Increase the target bandwidth for this frame.
this_frame_target += (this_frame_target * percent_high) / 200;
}
@@ -378,25 +382,27 @@
}
static void calc_pframe_target_size(VP9_COMP *const cpi) {
- int min_frame_target = MAX(cpi->rc.min_frame_bandwidth,
- cpi->rc.av_per_frame_bandwidth >> 5);
+ RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_CONFIG *const oxcf = &cpi->oxcf;
+ int min_frame_target = MAX(rc->min_frame_bandwidth,
+ rc->av_per_frame_bandwidth >> 5);
if (cpi->refresh_alt_ref_frame) {
// Special alt reference frame case
// Per frame bit target for the alt ref frame
- cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
- cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+ rc->per_frame_bandwidth = cpi->twopass.gf_bits;
+ rc->this_frame_target = rc->per_frame_bandwidth;
} else {
// Normal frames (gf and inter).
- cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+ rc->this_frame_target = rc->per_frame_bandwidth;
// Set target frame size based on buffer level, for 1 pass CBR.
- if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+ if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
// Need to decide how low min_frame_target should be for 1-pass CBR.
// For now, use: cpi->rc.av_per_frame_bandwidth / 16:
- min_frame_target = MAX(cpi->rc.av_per_frame_bandwidth >> 4,
+ min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
FRAME_OVERHEAD_BITS);
- cpi->rc.this_frame_target = target_size_from_buffer_level(cpi);
+ rc->this_frame_target = target_size_from_buffer_level(cpi);
// Adjust qp-max based on buffer level.
- cpi->rc.active_worst_quality =
+ rc->active_worst_quality =
adjust_active_worst_quality_from_buffer_level(cpi);
}
}
@@ -407,25 +413,24 @@
// not capable of recovering all the extra bits we have spent in the KF or GF,
// then the remainder will have to be recovered over a longer time span via
// other buffer / rate control mechanisms.
- if (cpi->rc.this_frame_target < min_frame_target) {
- cpi->rc.this_frame_target = min_frame_target;
- }
+ if (rc->this_frame_target < min_frame_target)
+ rc->this_frame_target = min_frame_target;
// Adjust target frame size for Golden Frames:
if (cpi->refresh_golden_frame) {
// If we are using alternate ref instead of gf then do not apply the boost
// It will instead be applied to the altref update
// Jims modified boost
- if (!cpi->rc.source_alt_ref_active) {
+ if (!rc->source_alt_ref_active) {
// The spend on the GF is defined in the two pass code
// for two pass encodes
- cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+ rc->this_frame_target = rc->per_frame_bandwidth;
} else {
// If there is an active ARF at this location use the minimum
// bits on this frame even if it is a constructed arf.
// The active maximum quantizer insures that an appropriate
// number of bits will be spent if needed for constructed ARFs.
- cpi->rc.this_frame_target = 0;
+ rc->this_frame_target = 0;
}
}
}
@@ -576,36 +581,34 @@
}
int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
- int *bottom_index,
- int *top_index) {
+ int *bottom_index, int *top_index) {
const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_CONFIG *const oxcf = &cpi->oxcf;
int active_best_quality;
- int active_worst_quality = cpi->rc.active_worst_quality;
+ int active_worst_quality = rc->active_worst_quality;
int q;
if (frame_is_intra_only(cm)) {
- active_best_quality = cpi->rc.best_quality;
+ active_best_quality = rc->best_quality;
#if !CONFIG_MULTIPLE_ARF
// Handle the special case for key frames forced when we have75 reached
// the maximum key frame interval. Here force the Q to a range
// based on the ambient Q to reduce the risk of popping.
- if (cpi->rc.this_key_frame_forced) {
- int delta_qindex;
- int qindex = cpi->rc.last_boosted_qindex;
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-
- delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
- (last_boosted_q * 0.75));
- active_best_quality = MAX(qindex + delta_qindex,
- cpi->rc.best_quality);
- } else if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+ int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+ (last_boosted_q * 0.75));
+ active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+ } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
// not first frame of one pass
double q_adj_factor = 1.0;
double q_val;
// Baseline value derived from cpi->active_worst_quality and kf boost
active_best_quality = get_active_quality(active_worst_quality,
- cpi->rc.kf_boost,
+ rc->kf_boost,
kf_low, kf_high,
kf_low_motion_minq,
kf_high_motion_minq);
@@ -631,29 +634,29 @@
active_best_quality = active_worst_quality
+ vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
#endif
- } else if (!cpi->rc.is_src_frame_alt_ref &&
+ } else if (!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
// Use the lower of active_worst_quality and recent
// average Q as basis for GF/ARF best Q limit unless last frame was
// a key frame.
- if (cpi->rc.frames_since_key > 1 &&
- cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
- q = cpi->rc.avg_frame_qindex[INTER_FRAME];
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
} else {
q = active_worst_quality;
}
// For constrained quality dont allow Q less than the cq level
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) {
if (q < cpi->cq_target_quality)
q = cpi->cq_target_quality;
- if (cpi->rc.frames_since_key > 1) {
- active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+ if (rc->frames_since_key > 1) {
+ active_best_quality = get_active_quality(q, rc->gfu_boost,
gf_low, gf_high,
afq_low_motion_minq,
afq_high_motion_minq);
} else {
- active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+ active_best_quality = get_active_quality(q, rc->gfu_boost,
gf_low, gf_high,
gf_low_motion_minq,
gf_high_motion_minq);
@@ -661,46 +664,46 @@
// Constrained quality use slightly lower active best.
active_best_quality = active_best_quality * 15 / 16;
- } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
if (!cpi->refresh_alt_ref_frame) {
active_best_quality = cpi->cq_target_quality;
} else {
- if (cpi->rc.frames_since_key > 1) {
+ if (rc->frames_since_key > 1) {
active_best_quality = get_active_quality(
- q, cpi->rc.gfu_boost, gf_low, gf_high,
+ q, rc->gfu_boost, gf_low, gf_high,
afq_low_motion_minq, afq_high_motion_minq);
} else {
active_best_quality = get_active_quality(
- q, cpi->rc.gfu_boost, gf_low, gf_high,
+ q, rc->gfu_boost, gf_low, gf_high,
gf_low_motion_minq, gf_high_motion_minq);
}
}
} else {
active_best_quality = get_active_quality(
- q, cpi->rc.gfu_boost, gf_low, gf_high,
+ q, rc->gfu_boost, gf_low, gf_high,
gf_low_motion_minq, gf_high_motion_minq);
}
} else {
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
active_best_quality = cpi->cq_target_quality;
} else {
if (cpi->pass == 0 &&
- cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
// 1-pass: for now, use the average Q for the active_best, if its lower
// than active_worst.
- active_best_quality = inter_minq[cpi->rc.avg_frame_qindex[INTER_FRAME]];
+ active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
else
active_best_quality = inter_minq[active_worst_quality];
// For the constrained quality mode we don't want
// q to fall below the cq level.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) &&
(active_best_quality < cpi->cq_target_quality)) {
// If we are strongly undershooting the target rate in the last
// frames then use the user passed in cq value not the auto
// cq value.
- if (cpi->rc.rolling_actual_bits < cpi->rc.min_frame_bandwidth)
- active_best_quality = cpi->oxcf.cq_level;
+ if (rc->rolling_actual_bits < rc->min_frame_bandwidth)
+ active_best_quality = oxcf->cq_level;
else
active_best_quality = cpi->cq_target_quality;
}
@@ -708,14 +711,14 @@
}
// Clip the active best and worst quality values to limits
- if (active_worst_quality > cpi->rc.worst_quality)
- active_worst_quality = cpi->rc.worst_quality;
+ if (active_worst_quality > rc->worst_quality)
+ active_worst_quality = rc->worst_quality;
- if (active_best_quality < cpi->rc.best_quality)
- active_best_quality = cpi->rc.best_quality;
+ if (active_best_quality < rc->best_quality)
+ active_best_quality = rc->best_quality;
- if (active_best_quality > cpi->rc.worst_quality)
- active_best_quality = cpi->rc.worst_quality;
+ if (active_best_quality > rc->worst_quality)
+ active_best_quality = rc->worst_quality;
if (active_worst_quality < active_best_quality)
active_worst_quality = active_best_quality;
@@ -725,29 +728,34 @@
#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
// Limit Q range for the adaptive loop.
- if (cm->frame_type == KEY_FRAME && !cpi->rc.this_key_frame_forced) {
- if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) {
+ if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
*top_index =
(active_worst_quality + active_best_quality * 3) / 4;
}
- } else if (!cpi->rc.is_src_frame_alt_ref &&
- (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+ } else if (!rc->is_src_frame_alt_ref &&
+ (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
*top_index =
(active_worst_quality + active_best_quality) / 2;
}
#endif
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
q = active_best_quality;
// Special case code to try and match quality with forced key frames
- } else if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
- q = cpi->rc.last_boosted_qindex;
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
} else {
- q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
active_best_quality, active_worst_quality);
- if (q > *top_index)
- q = *top_index;
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
}
#if CONFIG_MULTIPLE_ARF
// Force the quantizer determined by the coding order pattern.
@@ -766,12 +774,11 @@
printf("frame:%d q:%d\n", cm->current_video_frame, q);
}
#endif
- assert(*top_index <= cpi->rc.worst_quality &&
- *top_index >= cpi->rc.best_quality);
- assert(*bottom_index <= cpi->rc.worst_quality &&
- *bottom_index >= cpi->rc.best_quality);
- assert(q <= cpi->rc.worst_quality &&
- q >= cpi->rc.best_quality);
+ assert(*top_index <= rc->worst_quality &&
+ *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
return q;
}
@@ -810,6 +817,11 @@
*frame_under_shoot_limit -= 200;
if (*frame_under_shoot_limit < 0)
*frame_under_shoot_limit = 0;
+
+ // Clip to maximum allowed rate for a frame.
+ if (*frame_over_shoot_limit > cpi->rc.max_frame_bandwidth) {
+ *frame_over_shoot_limit = cpi->rc.max_frame_bandwidth;
+ }
}
}
@@ -822,6 +834,10 @@
else
calc_pframe_target_size(cpi);
+ // Clip the frame target to the maximum allowed value.
+ if (cpi->rc.this_frame_target > cpi->rc.max_frame_bandwidth)
+ cpi->rc.this_frame_target = cpi->rc.max_frame_bandwidth;
+
// Target rate per SB64 (including partial SB64s.
cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
(cpi->common.width * cpi->common.height);
@@ -843,24 +859,26 @@
}
static void update_golden_frame_stats(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
// Update the Golden frame usage counts.
if (cpi->refresh_golden_frame) {
// this frame refreshes means next frames don't unless specified by user
- cpi->rc.frames_since_golden = 0;
+ rc->frames_since_golden = 0;
- if (!cpi->rc.source_alt_ref_pending)
- cpi->rc.source_alt_ref_active = 0;
+ if (!rc->source_alt_ref_pending)
+ rc->source_alt_ref_active = 0;
// Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
+ if (rc->frames_till_gf_update_due > 0)
+ rc->frames_till_gf_update_due--;
} else if (!cpi->refresh_alt_ref_frame) {
// Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
+ if (rc->frames_till_gf_update_due > 0)
+ rc->frames_till_gf_update_due--;
- cpi->rc.frames_since_golden++;
+ rc->frames_since_golden++;
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index b46e808..5ba8915 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -134,27 +134,27 @@
return base + raster_block_offset(plane_bsize, raster_block, stride);
}
-static void fill_mode_costs(VP9_COMP *c) {
- VP9_COMMON *const cm = &c->common;
+static void fill_mode_costs(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+ FRAME_CONTEXT *const fc = &cm->fc;
int i, j;
for (i = 0; i < INTRA_MODES; i++)
for (j = 0; j < INTRA_MODES; j++)
- vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+ vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
vp9_intra_mode_tree);
// TODO(rbultje) separate tables for superblock costing?
- vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
- vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- vp9_kf_uv_mode_prob[INTRA_MODES - 1],
- vp9_intra_mode_tree);
+ vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+ vp9_cost_tokens(x->intra_uv_mode_cost[1],
+ fc->uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+ vp9_cost_tokens(x->intra_uv_mode_cost[0],
+ vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- cm->fc.switchable_interp_prob[i],
+ vp9_cost_tokens((int *)x->switchable_interp_costs[i],
+ fc->switchable_interp_prob[i],
vp9_switchable_interp_tree);
}
@@ -198,9 +198,9 @@
// This is to make it easier to resolve the impact of experimental changes
// to the quantizer tables.
for (i = 0; i < QINDEX_RANGE; i++) {
- sad_per_bit16lut[i] =
- (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
- sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
+ const double q = vp9_convert_qindex_to_q(i);
+ sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
}
}
@@ -234,36 +234,30 @@
static void set_block_thresholds(VP9_COMP *cpi) {
int i, bsize, segment_id;
VP9_COMMON *cm = &cpi->common;
+ SPEED_FEATURES *sf = &cpi->sf;
for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
- int q;
- int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
- segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
- q = compute_rd_thresh_factor(segment_qindex);
+ const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
+ cm->base_qindex) + cm->y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex);
for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
// Threshold here seem unecessarily harsh but fine given actual
// range of values used for cpi->sf.thresh_mult[]
- int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
- for (i = 0; i < MAX_MODES; ++i) {
- if (cpi->sf.thresh_mult[i] < thresh_max) {
- cpi->rd_threshes[segment_id][bsize][i] =
- cpi->sf.thresh_mult[i] * q *
- rd_thresh_block_size_factor[bsize] / 4;
- } else {
- cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
- }
- }
+ for (i = 0; i < MAX_MODES; ++i)
+ cpi->rd_threshes[segment_id][bsize][i] =
+ sf->thresh_mult[i] < thresh_max ? sf->thresh_mult[i] * t / 4
+ : INT_MAX;
for (i = 0; i < MAX_REFS; ++i) {
- if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
- cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
- cpi->sf.thresh_mult_sub8x8[i] * q *
- rd_thresh_block_size_factor[bsize] / 4;
- } else {
- cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
- }
+ cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
+ sf->thresh_mult_sub8x8[i] < thresh_max
+ ? sf->thresh_mult_sub8x8[i] * t / 4
+ : INT_MAX;
}
}
}
@@ -271,6 +265,7 @@
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &cpi->mb;
int qindex, i;
vp9_clear_system_state(); // __asm emms;
@@ -284,35 +279,32 @@
cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128)
cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
- cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
- cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+ x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO + (x->errorperbit == 0);
vp9_set_speed_features(cpi);
- cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
- cm->frame_type != KEY_FRAME) ?
- 0 : 1;
+ x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cm->frame_type != KEY_FRAME) ? 0 : 1;
set_block_thresholds(cpi);
- fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+ fill_token_costs(x->token_costs, cm->fc.coef_probs);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
+ vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
- /*rough estimate for costing*/
fill_mode_costs(cpi);
if (!frame_is_intra_only(cm)) {
- vp9_build_nmv_cost_table(
- cpi->mb.nmvjointcost,
- cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
- &cm->fc.nmvc,
- cm->allow_high_precision_mv, 1, 1);
+ vp9_build_nmv_cost_table(x->nmvjointcost,
+ cm->allow_high_precision_mv ? x->nmvcost_hp
+ : x->nmvcost,
+ &cm->fc.nmvc,
+ cm->allow_high_precision_mv, 1, 1);
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+ vp9_cost_tokens((int *)x->inter_mode_cost[i],
cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
}
}
@@ -464,8 +456,8 @@
BLOCK_SIZE bs;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
- const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
- const int height = 4 << num_4x4_blocks_high_lookup[bsize];
+ const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int height = 4 * num_4x4_blocks_high_lookup[bsize];
int rate_sum = 0;
int64_t dist_sum = 0;
const int t = 4 << tx_size;
@@ -640,7 +632,9 @@
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- struct encode_b_args encode_args = {x, NULL};
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct encode_b_args encode_args = {x, NULL, &mbmi->skip_coeff};
+
int64_t rd1, rd2, rd;
if (args->skip)
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 970a27a..b04e3fe 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -175,6 +175,18 @@
set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, aoff, loff);
}
+static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+ int16_t extra, uint8_t token,
+ uint8_t skip_eob_node,
+ unsigned int *counts) {
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)->context_tree = context_tree;
+ (*t)->skip_eob_node = skip_eob_node;
+ (*t)++;
+ ++counts[token];
+}
+
static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
@@ -186,9 +198,9 @@
struct macroblockd_plane *pd = &xd->plane[plane];
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int pt; /* near block/prev token context index */
- int c = 0, rc = 0;
+ int c = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
- const int eob = p->eobs[block];
+ int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
const int segment_id = mbmi->segment_id;
@@ -197,51 +209,53 @@
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- const uint8_t *const band_translate = get_band_translate(tx_size);
+ const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- assert((!type && !plane) || (type && plane));
-
pt = get_entropy_context(tx_size, pd->above_context + aoff,
- pd->left_context + loff);
+ pd->left_context + loff);
so = get_scan(xd, tx_size, type, block);
scan = so->scan;
nb = so->neighbors;
-
c = 0;
- do {
- const int band = band_translate[c];
- int token;
+ while (c < eob) {
int v = 0;
- rc = scan[c];
- if (c)
+ int skip_eob = 0;
+ v = qcoeff_ptr[scan[c]];
+
+ while (!v) {
+ add_token(&t, coef_probs[type][ref][band[c]][pt], 0, ZERO_TOKEN, skip_eob,
+ counts[type][ref][band[c]][pt]);
+
+ cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] +=
+ !skip_eob;
+
+ skip_eob = 1;
+ token_cache[scan[c]] = 0;
+ ++c;
pt = get_coef_context(nb, token_cache, c);
- if (c < eob) {
- v = qcoeff_ptr[rc];
- assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
-
- t->extra = vp9_dct_value_tokens_ptr[v].extra;
- token = vp9_dct_value_tokens_ptr[v].token;
- } else {
- token = EOB_TOKEN;
+ v = qcoeff_ptr[scan[c]];
}
+ add_token(&t, coef_probs[type][ref][band[c]][pt],
+ vp9_dct_value_tokens_ptr[v].extra,
+ vp9_dct_value_tokens_ptr[v].token, skip_eob,
+ counts[type][ref][band[c]][pt]);
- t->token = token;
- t->context_tree = coef_probs[type][ref][band][pt];
- t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+ cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += !skip_eob;
- assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-
- ++counts[type][ref][band][pt][token];
- if (!t->skip_eob_node)
- ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt];
-
- token_cache[rc] = vp9_pt_energy_class[token];
- ++t;
- } while (c < eob && ++c < seg_eob);
+ token_cache[scan[c]] =
+ vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token];
+ ++c;
+ pt = get_coef_context(nb, token_cache, c);
+ }
+ if (c < seg_eob) {
+ add_token(&t, coef_probs[type][ref][band[c]][pt], 0, EOB_TOKEN, 0,
+ counts[type][ref][band[c]][pt]);
+ ++cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt];
+ }
*tp = t;
@@ -285,8 +299,6 @@
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
-
- mbmi->skip_coeff = sb_is_skippable(&cpi->mb, bsize);
if (mbmi->skip_coeff) {
if (!dry_run)
cm->counts.mbskip[ctx][1] += skip_inc;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b1c029c..478b45a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,9 @@
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
+ifeq ($(ARCH_X86_64),yes)
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
+endif
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vpxdec.c b/vpxdec.c
index 97ac4bb..4204979 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -40,13 +40,12 @@
char const *name;
const vpx_codec_iface_t *(*iface)(void);
uint32_t fourcc;
- uint32_t fourcc_mask;
} ifaces[] = {
#if CONFIG_VP8_DECODER
- {"vp8", vpx_codec_vp8_dx, VP8_FOURCC_MASK, 0x00FFFFFF},
+ {"vp8", vpx_codec_vp8_dx, VP8_FOURCC},
#endif
#if CONFIG_VP9_DECODER
- {"vp9", vpx_codec_vp9_dx, VP9_FOURCC_MASK, 0x00FFFFFF},
+ {"vp9", vpx_codec_vp9_dx, VP9_FOURCC},
#endif
};
@@ -167,11 +166,10 @@
exit(EXIT_FAILURE);
}
-static int raw_read_frame(struct VpxInputContext *input_ctx, uint8_t **buffer,
+static int raw_read_frame(FILE *infile, uint8_t **buffer,
size_t *bytes_read, size_t *buffer_size) {
char raw_hdr[RAW_FRAME_HDR_SZ];
size_t frame_size = 0;
- FILE *infile = input_ctx->file;
if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
if (!feof(infile))
@@ -221,10 +219,10 @@
return webm_read_frame(input->webm_ctx,
buf, bytes_in_buffer, buffer_size);
case FILE_TYPE_RAW:
- return raw_read_frame(input->vpx_input_ctx,
+ return raw_read_frame(input->vpx_input_ctx->file,
buf, bytes_in_buffer, buffer_size);
case FILE_TYPE_IVF:
- return ivf_read_frame(input->vpx_input_ctx,
+ return ivf_read_frame(input->vpx_input_ctx->file,
buf, bytes_in_buffer, buffer_size);
default:
return 1;
@@ -671,7 +669,7 @@
/* Try to determine the codec from the fourcc. */
for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
- if ((vpx_input_ctx.fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
+ if (vpx_input_ctx.fourcc == ifaces[i].fourcc) {
vpx_codec_iface_t *vpx_iface = ifaces[i].iface();
if (iface && iface != vpx_iface)
diff --git a/vpxenc.c b/vpxenc.c
index 4c933ce..396e43d 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -362,12 +362,6 @@
"Motion detection threshold");
static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
"CPU Used (-16..16)");
-static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
- "Number of token partitions to use, log2");
-static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1,
- "Number of tile columns to use, log2");
-static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1,
- "Number of tile rows to use, log2");
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
"Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
@@ -387,16 +381,10 @@
"Constant/Constrained Quality level");
static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
"Max I-frame bitrate (pct)");
-static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
-#if CONFIG_VP9_ENCODER
-static const arg_def_t frame_parallel_decoding = ARG_DEF(
- NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
-static const arg_def_t aq_mode = ARG_DEF(
- NULL, "aq-mode", 1,
- "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
-#endif
#if CONFIG_VP8_ENCODER
+static const arg_def_t token_parts =
+ ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
static const arg_def_t *vp8_args[] = {
&cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -414,6 +402,17 @@
#endif
#if CONFIG_VP9_ENCODER
+static const arg_def_t tile_cols =
+ ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows =
+ ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+static const arg_def_t frame_parallel_decoding = ARG_DEF(
+ NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+static const arg_def_t aq_mode = ARG_DEF(
+ NULL, "aq-mode", 1,
+ "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
+
static const arg_def_t *vp9_args[] = {
&cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -1393,6 +1392,10 @@
static void open_output_file(struct stream_state *stream,
struct VpxEncoderConfig *global) {
const char *fn = stream->config.out_fn;
+ const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+ if (cfg->g_pass == VPX_RC_FIRST_PASS)
+ return;
stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
@@ -1404,18 +1407,23 @@
if (stream->config.write_webm) {
stream->ebml.stream = stream->file;
- write_webm_file_header(&stream->ebml, &stream->config.cfg,
+ write_webm_file_header(&stream->ebml, cfg,
&global->framerate,
stream->config.stereo_fmt,
global->codec->fourcc);
- } else
- ivf_write_file_header(stream->file, &stream->config.cfg,
- global->codec->fourcc, 0);
+ } else {
+ ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+ }
}
static void close_output_file(struct stream_state *stream,
- unsigned int fourcc) {
+ unsigned int fourcc) {
+ const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+ if (cfg->g_pass == VPX_RC_FIRST_PASS)
+ return;
+
if (stream->config.write_webm) {
write_webm_file_footer(&stream->ebml, stream->hash);
free(stream->ebml.cue_list);
diff --git a/webmdec.c b/webmdec.c
index 0c75d7a..fdcf3a5 100644
--- a/webmdec.c
+++ b/webmdec.c
@@ -82,9 +82,9 @@
codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
if (codec_id == NESTEGG_CODEC_VP8) {
- vpx_ctx->fourcc = VP8_FOURCC_MASK;
+ vpx_ctx->fourcc = VP8_FOURCC;
} else if (codec_id == NESTEGG_CODEC_VP9) {
- vpx_ctx->fourcc = VP9_FOURCC_MASK;
+ vpx_ctx->fourcc = VP9_FOURCC;
} else {
fatal("Not VPx video, quitting.\n");
}