Merge "Remove one shot q experiment"
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index bac40c5..d0d4852 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -74,10 +74,6 @@
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
- return mode <= TM_PRED;
-}
-
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -421,7 +417,7 @@
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
+static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
@@ -461,7 +457,7 @@
}
}
}
-static void set_contexts_on_border(MACROBLOCKD *xd,
+static void set_contexts_on_border(const MACROBLOCKD *xd,
struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
@@ -499,7 +495,7 @@
L[pt] = 0;
}
-static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
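
With is_intra_mode() removed, the predicate can still be recovered from the MB_PREDICTION_MODE ordering this header establishes (all intra modes sort before NEARESTMV). A minimal sketch with a hypothetical helper name, assuming that ordering holds:

    /* Hypothetical equivalent of the removed check: valid intra modes
     * (DC_PRED..TM_PRED) all precede the inter modes in the enum. */
    static INLINE int is_intra(MB_PREDICTION_MODE mode) {
      return mode < NEARESTMV;  /* same result as the old mode <= TM_PRED */
    }
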
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 2640ac7..d3a867c 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -322,9 +322,8 @@
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
- int t, i, j, k, l;
+ int i, j, k, l, m;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
- vp9_prob coef_probs[UNCONSTRAINED_NODES];
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
@@ -332,15 +331,14 @@
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
- vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
- branch_ct, coef_counts[i][j][k][l],
- 0);
+ vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
+ coef_counts[i][j][k][l], 0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < UNCONSTRAINED_NODES; ++t)
- dst_coef_probs[i][j][k][l][t] = merge_probs(
- pre_coef_probs[i][j][k][l][t], coef_probs[t],
- branch_ct[t], count_sat, update_factor);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ dst_coef_probs[i][j][k][l][m] = merge_probs(
+ pre_coef_probs[i][j][k][l][m],
+ branch_ct[m],
+ count_sat, update_factor);
}
}
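
The branch_ct[0][1] fix-up survives this refactor because node 0 of the coefficient model tree is the end-of-block decision: coef_counts only records coded tokens, while the number of times the node was visited is tracked separately in eob_branch_count. A hedged restatement of the invariant:

    /* Sketch of the invariant behind the fix-up above:
     *   branch_ct[0][0]              - times the EOB branch was taken
     *   eob_branch_count[i][j][k][l] - times node 0 was visited at all
     * so the not-EOB count is reconstructed by subtraction: */
    branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
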
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index ec7d09a..c58e852 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -153,8 +153,8 @@
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static int get_entropy_context(TX_SIZE tx_size,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
switch (tx_size) {
@@ -163,16 +163,16 @@
left_ec = l[0] != 0;
break;
case TX_8X8:
- above_ec = !!*(uint16_t *)a;
- left_ec = !!*(uint16_t *)l;
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
break;
case TX_16X16:
- above_ec = !!*(uint32_t *)a;
- left_ec = !!*(uint32_t *)l;
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
break;
case TX_32X32:
- above_ec = !!*(uint64_t *)a;
- left_ec = !!*(uint64_t *)l;
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
break;
default:
assert(!"Invalid transform size.");
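
The added const casts keep the existing width trick intact: all context bytes covered by one transform block are tested with a single load. A scalar sketch of the TX_8X8 case, assuming two context bytes per 8x8 block as in this function:

    /* Scalar equivalent of the TX_8X8 branch (sketch): a 16-bit word is
     * nonzero iff either of its context bytes is nonzero. */
    above_ec = (a[0] | a[1]) != 0;  /* same truth value as !!*(const uint16_t *)a */
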
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 21c91d6..c4d7c38 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -161,51 +161,52 @@
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
-static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS]
+const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 158, 97, 94 }, // a/l both not split
+ { 93, 24, 99 }, // a split, l not split
+ { 85, 119, 44 }, // l split, a not split
+ { 62, 59, 67 }, // a/l both split
+ // 16x16 -> 8x8
+ { 149, 53, 53 }, // a/l both not split
+ { 94, 20, 48 }, // a split, l not split
+ { 83, 53, 24 }, // l split, a not split
+ { 52, 18, 18 }, // a/l both split
+ // 32x32 -> 16x16
+ { 150, 40, 39 }, // a/l both not split
+ { 78, 12, 26 }, // a split, l not split
+ { 67, 33, 11 }, // l split, a not split
+ { 24, 7, 5 }, // a/l both split
+ // 64x64 -> 32x32
+ { 174, 35, 49 }, // a/l both not split
+ { 68, 11, 27 }, // a split, l not split
+ { 57, 15, 9 }, // l split, a not split
+ { 12, 3, 3 }, // a/l both split
+};
+
+static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
- { // frame_type = keyframe
- // 8x8 -> 4x4
- { 158, 97, 94 }, // a/l both not split
- { 93, 24, 99 }, // a split, l not split
- { 85, 119, 44 }, // l split, a not split
- { 62, 59, 67 }, // a/l both split
- // 16x16 -> 8x8
- { 149, 53, 53 }, // a/l both not split
- { 94, 20, 48 }, // a split, l not split
- { 83, 53, 24 }, // l split, a not split
- { 52, 18, 18 }, // a/l both split
- // 32x32 -> 16x16
- { 150, 40, 39 }, // a/l both not split
- { 78, 12, 26 }, // a split, l not split
- { 67, 33, 11 }, // l split, a not split
- { 24, 7, 5 }, // a/l both split
- // 64x64 -> 32x32
- { 174, 35, 49 }, // a/l both not split
- { 68, 11, 27 }, // a split, l not split
- { 57, 15, 9 }, // l split, a not split
- { 12, 3, 3 }, // a/l both split
- }, { // frame_type = interframe
- // 8x8 -> 4x4
- { 199, 122, 141 }, // a/l both not split
- { 147, 63, 159 }, // a split, l not split
- { 148, 133, 118 }, // l split, a not split
- { 121, 104, 114 }, // a/l both split
- // 16x16 -> 8x8
- { 174, 73, 87 }, // a/l both not split
- { 92, 41, 83 }, // a split, l not split
- { 82, 99, 50 }, // l split, a not split
- { 53, 39, 39 }, // a/l both split
- // 32x32 -> 16x16
- { 177, 58, 59 }, // a/l both not split
- { 68, 26, 63 }, // a split, l not split
- { 52, 79, 25 }, // l split, a not split
- { 17, 14, 12 }, // a/l both split
- // 64x64 -> 32x32
- { 222, 34, 30 }, // a/l both not split
- { 72, 16, 44 }, // a split, l not split
- { 58, 32, 12 }, // l split, a not split
- { 10, 7, 6 }, // a/l both split
- }
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
};
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
@@ -349,13 +350,8 @@
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob,
- const unsigned int ct[2]) {
- return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
+static int update_ct(vp9_prob pre_prob, const unsigned int ct[2]) {
+ return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
static void update_mode_probs(int n_modes,
@@ -364,14 +360,13 @@
const vp9_prob *pre_probs, vp9_prob *dst_probs,
unsigned int tok0_offset) {
#define MAX_PROBS 32
- vp9_prob probs[MAX_PROBS];
unsigned int branch_ct[MAX_PROBS][2];
int t;
assert(n_modes - 1 < MAX_PROBS);
- vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
+ vp9_tree_probs_from_distribution(tree, branch_ct, cnt, tok0_offset);
for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
+ dst_probs[t] = update_ct(pre_probs[t], branch_ct[t]);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
@@ -381,18 +376,18 @@
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ fc->intra_inter_prob[i] = update_ct(pre_fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
- counts->comp_inter[i]);
+ fc->comp_inter_prob[i] = update_ct(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
- counts->comp_ref[i]);
+ fc->comp_ref_prob[i] = update_ct(pre_fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
- counts->single_ref[i][j]);
+ fc->single_ref_prob[i][j] = update_ct(pre_fc->single_ref_prob[i][j],
+ counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
@@ -410,10 +405,8 @@
fc->uv_mode_prob[i], 0);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- counts->partition[i],
- pre_fc->partition_prob[INTER_FRAME][i],
- fc->partition_prob[INTER_FRAME][i], 0);
+ update_mode_probs(PARTITION_TYPES, vp9_partition_tree, counts->partition[i],
+ pre_fc->partition_prob[i], fc->partition_prob[i], 0);
if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
@@ -432,24 +425,23 @@
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
- branch_ct_8x8p[j]);
+ fc->tx_probs.p8x8[i][j] = update_ct(pre_fc->tx_probs.p8x8[i][j],
+ branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
- branch_ct_16x16p[j]);
+ fc->tx_probs.p16x16[i][j] = update_ct(pre_fc->tx_probs.p16x16[i][j],
+ branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
- branch_ct_32x32p[j]);
+ fc->tx_probs.p32x32[i][j] = update_ct(pre_fc->tx_probs.p32x32[i][j],
+ branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
- counts->mbskip[i]);
+ fc->mbskip_probs[i] = update_ct(pre_fc->mbskip_probs[i], counts->mbskip[i]);
}
static void set_default_lf_deltas(struct loopfilter *lf) {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ea96555..38b4199 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -38,6 +38,9 @@
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1];
+
extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index f70b571..3ebb701 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -191,7 +191,7 @@
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
+ return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
static unsigned int adapt_probs(unsigned int i,
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 8f24052..79ace14 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -97,19 +97,15 @@
{ 0, -3, 1, 38, 64, 32, -1, -3}
};
+
+static const subpel_kernel *vp9_filter_kernels[4] = {
+ vp9_sub_pel_filters_8,
+ vp9_sub_pel_filters_8lp,
+ vp9_sub_pel_filters_8s,
+ vp9_bilinear_filters
+};
+
const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
- switch (type) {
- case EIGHTTAP:
- return vp9_sub_pel_filters_8;
- case EIGHTTAP_SMOOTH:
- return vp9_sub_pel_filters_8lp;
- case EIGHTTAP_SHARP:
- return vp9_sub_pel_filters_8s;
- case BILINEAR:
- return vp9_bilinear_filters;
- default:
- assert(!"Invalid interpolation type.");
- return NULL;
- }
+ return vp9_filter_kernels[type];
}
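
The table lookup drops the switch's default-case assert, so vp9_get_filter_kernel now trusts type to be a valid index (EIGHTTAP..BILINEAR, i.e. 0..3); if the enum still carries a SWITCHABLE sentinel, passing it would read past the table instead of asserting. A defensive sketch, not part of this patch:

    /* Hypothetical bounds-checked variant (needs <assert.h>): */
    const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
      assert(type >= EIGHTTAP && type <= BILINEAR);
      return vp9_filter_kernels[type];
    }
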
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 8652a6e..b1e7e64 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -39,7 +39,6 @@
const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ba2e9d8..a2af57a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -41,7 +41,7 @@
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+ vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
@@ -245,6 +245,11 @@
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
+static INLINE const vp9_prob *get_partition_probs(VP9_COMMON *cm, int ctx) {
+ return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
+ : cm->fc.partition_prob[ctx];
+}
+
static INLINE void set_skip_context(
MACROBLOCKD *xd,
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE],
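
With the FRAME_TYPES dimension gone from frame_contexts, key-frame partition probabilities come from the shared vp9_kf_partition_probs table and this helper selects per frame type. A typical call site, mirroring the decoder change later in this patch:

    /* Sketch: pick partition probabilities for one plane context. */
    const int ctx = partition_plane_context(xd->above_seg_context,
                                            xd->left_seg_context,
                                            mi_row, mi_col, bsize);
    const vp9_prob *const probs = get_partition_probs(cm, ctx);
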
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 53b9003..1c96788 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -161,13 +161,12 @@
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
- const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
+    // The mv precision here is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
uint8_t *pre;
- // mv_precision precision is MV_PRECISION_Q4.
- const MV mv_q4 = {res_mv.row, res_mv.col };
MV32 scaled_mv;
int xs, ys;
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index bd609dc..eb643b0 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -369,7 +369,7 @@
}
}
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index e9d0dbf..6e3f55c 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
- TX_SIZE tx_size, int mode,
- const uint8_t *ref, int ref_stride,
- uint8_t *dst, int dst_stride);
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+ TX_SIZE tx_size, int mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 5e049c6..3f3268f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -200,7 +200,7 @@
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
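
Adding avx2 to the specialize line makes the generated vp9_rtcd() prefer the new AVX2 kernel when the CPU reports support. Roughly what the generated dispatch does — a sketch, since the real glue is emitted by the rtcd scripts:

    /* Simplified shape of the generated runtime setup (sketch): */
    vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
    if (flags & HAS_SSE2)
      vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
    if (flags & HAS_AVX2)
      vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
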
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index a5c8463..14a1a7e 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -191,8 +191,7 @@
}
static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
+ const uint8_t *token_cache, int c) {
return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
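
get_coef_context averages the token categories of the two previously coded neighbors, rounding up via the +1. A worked instance with hypothetical cache contents:

    /* Sketch: the neighbors of coefficient 0 hold categories 2 and 1,
     * so the context is (1 + 2 + 1) >> 1 = 2. */
    const uint8_t token_cache[2] = { 2, 1 };
    const int16_t neighbors[MAX_NEIGHBORS] = { 0, 1 };  /* scan positions */
    const int ctx = get_coef_context(neighbors, token_cache, 0);  /* -> 2 */
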
diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c
index da1213d..1805fb4 100644
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -40,9 +40,7 @@
tree2tok(p - offset, t, 0, 0, 0);
}
-static unsigned int convert_distribution(unsigned int i,
- vp9_tree tree,
- vp9_prob probs[],
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
unsigned int branch_ct[][2],
const unsigned int num_events[],
unsigned int tok0_offset) {
@@ -51,24 +49,25 @@
if (tree[i] <= 0) {
left = num_events[-tree[i] - tok0_offset];
} else {
- left = convert_distribution(tree[i], tree, probs, branch_ct,
- num_events, tok0_offset);
+ left = convert_distribution(tree[i], tree, branch_ct, num_events,
+ tok0_offset);
}
if (tree[i + 1] <= 0)
right = num_events[-tree[i + 1] - tok0_offset];
else
- right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
- num_events, tok0_offset);
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
+ tok0_offset);
- probs[i>>1] = get_binary_prob(left, right);
- branch_ct[i>>1][0] = left;
- branch_ct[i>>1][1] = right;
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
return left + right;
}
-void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */],
+void vp9_tree_probs_from_distribution(vp9_tree tree,
unsigned int branch_ct[/* n-1 */][2],
const unsigned int num_events[/* n */],
unsigned int tok0_offset) {
- convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
+ convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
}
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 4ba171f..3cc9ce1 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -50,11 +50,11 @@
probability updates. */
void vp9_tree_probs_from_distribution(vp9_tree tree,
- vp9_prob probs[ /* n - 1 */ ],
unsigned int branch_ct[ /* n - 1 */ ][2],
const unsigned int num_events[ /* n */ ],
unsigned int tok0_offset);
+
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
@@ -81,22 +81,15 @@
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
+ const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = MIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
-static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
- max_update_factor);
-}
-
#endif // VP9_COMMON_VP9_TREECODER_H_
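
Folding merge_probs2 into merge_probs changes no arithmetic; the MAP estimate is simply derived inside now. A worked instance using the adaptation constants from vp9_entropymode.c (COUNT_SAT 20, MAX_UPDATE_FACTOR 128):

    /* merge_probs(128, ct, 20, 128) with ct = {30, 10} (sketch):
     *   count  = MIN(30 + 10, 20)                              = 20
     *   factor = 128 * 20 / 20                                 = 128
     *   prob   = get_binary_prob(30, 10)                       = 192
     *   result = ROUND_POWER_OF_TWO(128*(256-128) + 192*128, 8) = 160
     */
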
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm
index 568e208..88df9b2 100644
--- a/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -991,7 +991,7 @@
lea dst8q, [dst8q+strideq*4]
; output 2nd half of 3rd 8 lines and half of 4th 8 lines
- mova m0, [sh_b23456789abcdefff]
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
mova [dstq +16], m7
mova [dst8q ], m7
pshufb m7, m0
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
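
The count argument picks the width: 1 runs the 8-pixel path, which packs the p and q rows of each tap pair into one xmm register; anything else runs the 16-pixel path, which widens to ymm for the flat-filter sums. A minimal caller sketch with made-up threshold bytes (real values come from the loop-filter limit tables):

    /* Sketch: filter the horizontal edge between rows 7 and 8 of a
     * 16x16 patch; blimit/limit/thresh values are hypothetical. */
    unsigned char buf[16 * 16];  /* pixels, pitch 16; rows 0..15 are read */
    const unsigned char blimit = 60, limit = 10, thresh = 2;
    vp9_mb_lpf_horizontal_edge_w_avx2(buf + 8 * 16, 16, &blimit, &limit,
                                      &thresh, 2 /* 16-wide */);
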
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 475a299..1ca5786 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -149,16 +149,17 @@
return segment_id;
}
-static uint8_t read_skip_coeff(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- int segment_id, vp9_reader *r) {
- int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- if (!skip_coeff) {
+static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, vp9_reader *r) {
+ if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
const int ctx = vp9_get_pred_context_mbskip(xd);
- skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
+ const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.mbskip[ctx][skip_coeff];
+ ++cm->counts.mbskip[ctx][skip];
+ return skip;
}
- return skip_coeff;
}
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index bf3a101..63b889d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -244,9 +244,8 @@
aligned_mi_cols));
}
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+static void inverse_transform_block(MACROBLOCKD *xd, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
const int stride = pd->dst.stride;
@@ -263,7 +262,7 @@
if (tx_type == DCT_DCT)
xd->itxm_add(qcoeff, dst, stride, eob);
else
- vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht4x4_16_add(qcoeff, dst, stride, tx_type);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -286,15 +285,27 @@
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ vpx_memset(qcoeff, 0, 256 * sizeof(qcoeff[0]));
else
vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0]));
}
}
}
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+struct intra_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct intra_args *const args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
@@ -313,25 +324,30 @@
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- if (!mi->mbmi.skip_coeff)
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff) {
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ args->r);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ }
}
-static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, vp9_reader *r) {
- MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+struct inter_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ int *eobtotal;
+};
- if (mbmi->skip_coeff) {
- reset_skip_context(xd, bsize);
- return -1;
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
+static void reconstruct_inter_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct inter_args *args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
- // TODO(dkovalev) if (!vp9_reader_has_error(r))
- return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize);
- }
+ *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, tx_size, args->r);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
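Both reconstruction paths now pass state to their foreach_transformed_block() visitor through a small argument struct instead of smuggling everything behind the MACROBLOCKD pointer. The driving code in decode_modes_b (next hunk) reduces to roughly:

    /* Intra: predict, then decode tokens and inverse-transform per block. */
    struct intra_args intra_arg = { cm, xd, r };
    foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                              &intra_arg);

    /* Inter: accumulate per-block eobs so the caller can still mark the
     * whole block as skippable for the loop filter when nothing was coded. */
    int eobtotal = 0;
    struct inter_args inter_arg = { cm, xd, r, &eobtotal };
    foreach_transformed_block(xd, bsize, reconstruct_inter_block, &inter_arg);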
@@ -385,7 +401,6 @@
vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
- int eobtotal;
set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
@@ -395,50 +410,68 @@
// Has to be called after set_offsets
mbmi = &xd->mi_8x8[0]->mbmi;
- eobtotal = decode_tokens(cm, xd, bsize, r);
+
+ if (mbmi->skip_coeff) {
+ reset_skip_context(xd, bsize);
+ } else {
+ if (cm->seg.enabled)
+ setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+ cm->base_qindex));
+ }
if (!is_inter_block(mbmi)) {
- // Intra reconstruction
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ struct intra_args arg = { cm, xd, r };
+ foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+ &arg);
} else {
- // Inter reconstruction
- const int decode_blocks = (eobtotal > 0);
-
- if (!less8x8) {
- assert(mbmi->sb_type == bsize);
- if (eobtotal == 0)
- mbmi->skip_coeff = 1; // skip loopfilter
- }
-
+ // Setup
set_ref(cm, xd, 0, mi_row, mi_col);
if (has_second_ref(mbmi))
set_ref(cm, xd, 1, mi_row, mi_col);
xd->subpix.filter_x = xd->subpix.filter_y =
vp9_get_filter_kernel(mbmi->interp_filter);
+
+ // Prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- if (decode_blocks)
- foreach_transformed_block(xd, bsize, decode_block, xd);
+ // Reconstruction
+ if (!mbmi->skip_coeff) {
+ int eobtotal = 0;
+ struct inter_args arg = { cm, xd, r, &eobtotal };
+ foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ if (!less8x8 && eobtotal == 0)
+ mbmi->skip_coeff = 1; // skip loopfilter
+ }
}
+
xd->corrupted |= vp9_reader_has_error(r);
}
-static PARTITION_TYPE read_partition(int hbs, int mi_rows, int mi_cols,
- int mi_row, int mi_col,
- vp9_prob probs[PARTITION_TYPES - 1],
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
vp9_reader *r) {
- const int has_rows = (mi_row + hbs) < mi_rows;
- const int has_cols = (mi_col + hbs) < mi_cols;
+ const int ctx = partition_plane_context(xd->above_seg_context,
+ xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ PARTITION_TYPE p;
if (has_rows && has_cols)
- return treed_read(r, vp9_partition_tree, probs);
+ p = treed_read(r, vp9_partition_tree, probs);
else if (!has_rows && has_cols)
- return vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+ p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
else if (has_rows && !has_cols)
- return vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+ p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
else
- return PARTITION_SPLIT;
+ p = PARTITION_SPLIT;
+
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][p];
+
+ return p;
}
static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
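read_partition() now derives its own context and probability table and folds the count update in behind the frame_parallel_decoding_mode check, so decode_modes_sb no longer needs to. get_partition_probs() itself is not part of this patch; given that partition_prob loses its [frame_type] dimension here, it presumably selects between the constant key-frame table and the adaptive per-frame one, along the lines of:

    static INLINE const vp9_prob *get_partition_probs(const VP9_COMMON *cm,
                                                      int ctx) {
      return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
                                         : cm->fc.partition_prob[ctx];
    }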
@@ -448,19 +481,11 @@
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- int ctx;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
- mi_row, mi_col, bsize);
- partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
- cm->fc.partition_prob[cm->frame_type][ctx], r);
-
- if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[ctx][partition];
-
+ partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
if (subsize < BLOCK_8X8) {
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
@@ -530,16 +555,10 @@
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
vp9_reader *r) {
- read_coef_probs_common(fc->coef_probs[TX_4X4], r);
-
- if (tx_mode > ONLY_4X4)
- read_coef_probs_common(fc->coef_probs[TX_8X8], r);
-
- if (tx_mode > ALLOW_8X8)
- read_coef_probs_common(fc->coef_probs[TX_16X16], r);
-
- if (tx_mode > ALLOW_16X16)
- read_coef_probs_common(fc->coef_probs[TX_32X32], r);
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
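The chain of tx_mode comparisons becomes a loop bounded by a lookup table. tx_mode_to_biggest_tx_size[] is defined elsewhere; to reproduce the removed cascade it must map each TX_MODE to the largest transform whose probabilities get coded, i.e. (assumed contents):

    static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
      TX_4X4,    // ONLY_4X4
      TX_8X8,    // ALLOW_8X8
      TX_16X16,  // ALLOW_16X16
      TX_32X32,  // ALLOW_32X32
      TX_32X32,  // TX_MODE_SELECT
    };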
@@ -1191,7 +1210,7 @@
for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
- vp9_diff_update_prob(&r, &fc->partition_prob[INTER_FRAME][j][i]);
+ vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 0d0f0df..6ecce28 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -210,45 +210,25 @@
return c;
}
-struct decode_block_args {
- VP9_COMMON *cm;
- MACROBLOCKD *xd;
- struct segmentation *seg;
- vp9_reader *r;
- int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *argv) {
- const struct decode_block_args* const arg = argv;
-
- // find the maximum eob for this transform size, adjusted by segment
- MACROBLOCKD *xd = arg->xd;
- const struct segmentation *seg = arg->seg;
- struct macroblockd_plane* pd = &xd->plane[plane];
- const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+ tx_size);
int aoff, loff, eob, pt;
-
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
pt = get_entropy_context(tx_size, pd->above_context + aoff,
pd->left_context + loff);
- eob = decode_coefs(arg->cm, xd, arg->r, block,
+ eob = decode_coefs(cm, xd, r, block,
pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
tx_size, pd->dequant, pt);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
pd->eobs[block] = eob;
- *arg->eobtotal += eob;
+ return eob;
}
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize) {
- int eobtotal = 0;
- struct decode_block_args args = {cm, xd, seg, r, &eobtotal};
- foreach_transformed_block(xd, bsize, decode_block, &args);
- return eobtotal;
-}
+
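With vp9_decode_tokens() and its decode_block_args plumbing gone, token decoding is exposed per transform block and callers accumulate the total themselves. A hypothetical harness (real callers are the visitor functions in vp9_decodframe.c above; num_4x4_blocks and the fixed tx_size are assumptions of this sketch):

    int block, eobtotal = 0;
    const int step = 1 << (tx_size << 1);  // 4x4 units per transform block
    for (block = 0; block < num_4x4_blocks; block += step)
      eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize,
                                          tx_size, r);
    // eobtotal == 0 lets the caller flag the block as skippable.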
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 0fb4c3c..94dd8e4 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,8 +15,8 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index a996e0e..87bd36c 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -163,18 +163,13 @@
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-static void update_mode(
- vp9_writer *w,
- int n,
- vp9_tree tree,
- vp9_prob Pnew[/* n-1 */],
- vp9_prob Pcur[/* n-1 */],
- unsigned int bct[/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
+static void update_mode(vp9_writer *w, int n, vp9_tree tree,
+ vp9_prob Pcur[/* n-1 */],
+ unsigned int bct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
int i = 0;
- vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+ vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
n--;
for (i = 0; i < n; ++i)
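Every call site in this patch drops the probability output array: vp9_tree_probs_from_distribution() now only fills the branch counts, and probabilities are derived on demand with get_binary_prob() or merge_probs(). Its declaration after this change is presumably:

    void vp9_tree_probs_from_distribution(vp9_tree tree,
                                          unsigned int branch_ct[/* n-1 */][2],
                                          const unsigned int num_events[/* n */],
                                          unsigned int tok0_offset);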
@@ -185,11 +180,10 @@
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
int j;
- vp9_prob pnew[INTRA_MODES - 1];
unsigned int bct[INTRA_MODES - 1][2];
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
+ update_mode(bc, INTRA_MODES, vp9_intra_mode_tree,
cm->fc.y_mode_prob[j], bct,
(unsigned int *)cpi->y_mode_count[j]);
}
@@ -231,43 +225,35 @@
write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
}
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
- vp9_writer* const bc) {
+static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
+ unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2];
int i, j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
- vp9_tree_probs_from_distribution(
- vp9_switchable_interp_tree,
- new_prob[j], branch_ct[j],
- cm->counts.switchable_interp[j], 0);
+ vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
+ cm->counts.switchable_interp[j], 0);
+
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
+ branch_ct[i]);
}
- for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- branch_ct[j][i]);
- }
- }
+
#ifdef MODE_STATS
if (!cpi->dummy_packing)
update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
unsigned int branch_ct[INTER_MODES - 1][2];
- vp9_prob new_prob[INTER_MODES - 1];
-
- vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
- new_prob, branch_ct,
+ vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
cm->counts.inter_mode[i], NEARESTMV);
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
branch_ct[j]);
}
}
@@ -592,25 +578,26 @@
pack_mb_tokens(bc, tok, tok_end);
}
-static void write_partition(PARTITION_TYPE partition,
- int hbs, int mi_rows, int mi_cols,
- int mi_row, int mi_col,
- vp9_prob probs[PARTITION_TYPES - 1],
- vp9_writer *w) {
- const int has_rows = (mi_row + hbs) < mi_rows;
- const int has_cols = (mi_col + hbs) < mi_cols;
+static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+ PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int ctx = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
if (has_rows && has_cols) {
- write_token(w, vp9_partition_tree, probs,
- &vp9_partition_encodings[partition]);
+ write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
} else if (!has_rows && has_cols) {
- assert(partition == PARTITION_SPLIT || partition == PARTITION_HORZ);
- vp9_write(w, partition == PARTITION_SPLIT, probs[1]);
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vp9_write(w, p == PARTITION_SPLIT, probs[1]);
} else if (has_rows && !has_cols) {
- assert(partition == PARTITION_SPLIT || partition == PARTITION_VERT);
- vp9_write(w, partition == PARTITION_SPLIT, probs[2]);
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vp9_write(w, p == PARTITION_SPLIT, probs[2]);
} else {
- assert(partition == PARTITION_SPLIT);
+ assert(p == PARTITION_SPLIT);
}
}
@@ -637,11 +624,7 @@
if (index > 0)
return;
} else {
- const int ctx = partition_plane_context(cpi->above_seg_context,
- cpi->left_seg_context,
- mi_row, mi_col, bsize);
- write_partition(partition, bs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
- cm->fc.partition_prob[cm->frame_type][ctx], bc);
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
}
subsize = get_subsize(bsize, partition);
@@ -710,8 +693,7 @@
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
- vp9_prob full_probs[ENTROPY_NODES];
- int i, j, k, l;
+ int i, j, k, l, m;
for (i = 0; i < BLOCK_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
@@ -720,16 +702,14 @@
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coef_tree,
- full_probs,
coef_branch_ct[i][j][k][l],
coef_counts[i][j][k][l], 0);
- vpx_memcpy(coef_probs[i][j][k][l], full_probs,
- sizeof(vp9_prob) * UNCONSTRAINED_NODES);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
- coef_probs[i][j][k][l][0] =
- get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
- coef_branch_ct[i][j][k][l][0][1]);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] = get_binary_prob(
+ coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
@@ -1467,11 +1447,9 @@
update_mbintra_mode_probs(cpi, &header_bc);
for (i = 0; i < PARTITION_CONTEXTS; ++i) {
- vp9_prob pnew[PARTITION_TYPES - 1];
unsigned int bct[PARTITION_TYPES - 1][2];
- update_mode(&header_bc, PARTITION_TYPES,
- vp9_partition_tree, pnew,
- fc->partition_prob[cm->frame_type][i], bct,
+ update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree,
+ fc->partition_prob[i], bct,
(unsigned int *)cpi->partition_count[i]);
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 44ade18..86332bc 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -683,10 +683,6 @@
[mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
-
- // Count of last ref frame 0,0 usage
- if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
- cpi->inter_zz_count++;
}
}
@@ -1931,9 +1927,6 @@
totalrate = 0;
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
vp9_zero(cm->counts.switchable_interp);
vp9_zero(cpi->tx_stepdown_count);
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 9ebcc49..e2c6c4c 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -124,8 +124,9 @@
}
}
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
- vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+ vp9_prob upd_p) {
+ const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);
vp9_prob mod_p = new_p | 1;
const int cur_b = cost_branch256(ct, *cur_p);
const int mod_b = cost_branch256(ct, mod_p);
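update_mv() now computes the candidate probability from the branch counts itself, which is what lets the callers below stop carrying a whole parallel nmv_context of freshly derived probabilities. get_binary_prob() is the usual count-to-probability helper; roughly (a sketch, assuming the clamp() helper from vp9_common.h):

    static INLINE vp9_prob get_binary_prob(int n0, int n1) {
      const int den = n0 + n1;
      if (den == 0)
        return 128;  // no observations: stay at the midpoint
      // P(branch 0) scaled to [1, 255], rounded to nearest
      return (vp9_prob)clamp((n0 * 256 + (den >> 1)) / den, 1, 255);
    }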
@@ -143,7 +144,6 @@
static void counts_to_nmv_context(
nmv_context_counts *nmv_count,
- nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
@@ -156,29 +156,24 @@
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
branch_ct_joint,
nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
- prob->comps[i].sign = get_binary_prob(s0, s1);
branch_ct_sign[i][0] = s0;
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
branch_ct_class0[i],
nmv_count->comps[i].class0, 0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
branch_ct_bits[i][j][0] = b0;
branch_ct_bits[i][j][1] = b1;
}
@@ -186,12 +181,10 @@
for (i = 0; i < 2; ++i) {
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
nmv_count->comps[i].class0_fp[k], 0);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
branch_ct_fp[i],
nmv_count->comps[i].fp, 0);
}
@@ -202,11 +195,9 @@
const uint32_t hp0 = nmv_count->comps[i].hp[0];
const uint32_t hp1 = nmv_count->comps[i].hp[1];
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
branch_ct_class0_hp[i][0] = c0_hp0;
branch_ct_class0_hp[i][1] = c0_hp1;
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
branch_ct_hp[i][0] = hp0;
branch_ct_hp[i][1] = hp1;
}
@@ -215,7 +206,6 @@
void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int i, j;
- nmv_context prob;
unsigned int branch_ct_joint[MV_JOINTS - 1][2];
unsigned int branch_ct_sign[2][2];
unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
@@ -227,30 +217,28 @@
unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
- counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ counts_to_nmv_context(&cpi->NMVcount, usehp,
branch_ct_joint, branch_ct_sign, branch_ct_classes,
branch_ct_class0, branch_ct_bits,
branch_ct_class0_fp, branch_ct_fp,
branch_ct_class0_hp, branch_ct_hp);
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
@@ -258,21 +246,19 @@
int k;
for (k = 0; k < 3; ++k)
update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
+ &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
}
for (j = 0; j < 3; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
}
}
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 9f3da27..1d3170a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1032,11 +1032,6 @@
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- // Data used for real time vc mode to see if gf needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
@@ -1185,7 +1180,6 @@
int i;
cpi->oxcf = *oxcf;
- cpi->goldfreq = 7;
cm->version = oxcf->version;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1ec2eaf..9429c7f 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -501,14 +501,9 @@
int decimation_count;
// for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
int speed;
- unsigned int cpu_freq; // Mhz
int compressor_speed;
- int interquantizer;
- int goldfreq;
int auto_worst_q;
int cpu_used;
int pass;
@@ -524,12 +519,6 @@
unsigned int max_mv_magnitude;
int mv_step_param;
- // Data used for real time conferencing mode to help determine if it
- // would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
-
unsigned char *segmentation_map;
// segment threshold for encode breakout

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9d2624e..993919e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -251,8 +251,7 @@
fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i],
- cm->fc.partition_prob[cm->frame_type][i],
+ vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
/*rough estimate for costing*/
@@ -1093,7 +1092,7 @@
else
x->fwd_txm4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+ vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
ratey += cost_coeffs(x, 0, block,
tempa + idx, templ + idy, TX_4X4, scan, nb);
@@ -1560,7 +1559,7 @@
coeff = BLOCK_OFFSET(p->coeff, k);
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
+ vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
get_iscan_4x4(DCT_DCT));
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
@@ -1872,12 +1871,14 @@
mi_buf_restore(x, orig_src, orig_pre);
}
- if (has_second_rf && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+ if (has_second_rf) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
+ }
+ if (has_second_rf && this_mode == NEWMV &&
+ mbmi->interp_filter == EIGHTTAP) {
// adjust src pointers
mi_buf_shift(x, i);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -2283,12 +2284,8 @@
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
- scale[frame_type].x_offset_q4 =
- ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].sfc->x_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
- scale[frame_type].y_offset_q4 =
- ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].sfc->y_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
+ scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+ mi_row * MI_SIZE, mi_col * MI_SIZE);
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
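The open-coded q4 offset computation moves behind the set_scaled_offsets hook on the shared scale-factor struct. The callee is not shown in this patch; presumably it reproduces the removed arithmetic with pixel coordinates passed in, e.g.:

    static void set_scaled_offsets(struct scale_factors *scale,
                                   int row, int col) {
      scale->x_offset_q4 = ROUND_POWER_OF_TWO(col * scale->sfc->x_scale_fp,
                                              REF_SCALE_SHIFT) & 0xf;
      scale->y_offset_q4 = ROUND_POWER_OF_TWO(row * scale->sfc->y_scale_fp,
                                              REF_SCALE_SHIFT) & 0xf;
    }

with callers passing mi_row * MI_SIZE and mi_col * MI_SIZE, as above.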
@@ -2665,6 +2662,12 @@
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
@@ -2683,9 +2686,6 @@
&mbmi->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
@@ -3541,17 +3541,16 @@
}
// Keep record of best intra rd
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->mi_8x8[0]->mbmi.mode) &&
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
this_rd < best_intra_rd) {
best_intra_rd = this_rd;
best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
}
+
// Keep record of best inter rd with single reference
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
- !mode_excluded &&
- this_rd < best_inter_rd) {
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+ !mode_excluded && this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
}
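The record-keeping conditions are restated in terms of the inline predicates rather than raw ref_frame comparisons. These are presumably the vp9_blockd.h helpers:

    static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[0] > INTRA_FRAME;
    }

    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;
    }

so "intra" is simply !is_inter_block(), which also subsumes the dropped is_intra_mode() check.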
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index eb864d9..387fc90 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,7 @@
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- unsigned int *ct) {
+ const unsigned int ct[2]) {
const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 11fa2e0..0badb08 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,7 @@
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm