Merge "Fix vp9_build_intra_predictors_sbuv_s for non-4:2:0" into experimental
diff --git a/configure b/configure
index 5cbf070..cc8c581 100755
--- a/configure
+++ b/configure
@@ -247,6 +247,7 @@
multiple_arf
non420
ab4x4
+ comp_inter_joint_search
"
CONFIG_LIST="
external_build
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index f6d2d59..9fb45d6 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
@@ -269,19 +270,23 @@
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[256], coeff[256];
- int16_t out_c[256];
+ uint8_t dst[256], src[256];
double out_r[256];
+ for (int j = 0; j < 256; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
- in[j] = rnd.Rand8() - rnd.Rand8();
+ in[j] = src[j] - dst[j];
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]);
- vp9_short_idct16x16_c(coeff, out_c, 32);
+ vp9_short_idct16x16_add_c(coeff, dst, 16);
for (int j = 0; j < 256; ++j) {
- const int diff = out_c[j] - in[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@
}
}
}
-#if 1
+
// we need enable fdct test once we re-do the 16 point fdct.
TEST(VP9Fdct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[256];
int16_t test_temp_block[256];
- int16_t test_output_block[256];
+ uint8_t dst[256], src[256];
+ for (int j = 0; j < 256; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 32;
vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
for (int j = 0; j < 256; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
@@ -354,6 +363,4 @@
}
}
}
-#endif
-
} // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index a565270..e05d482 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
- void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+ void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
}
#include "test/acm_random.h"
@@ -91,28 +91,31 @@
}
}
-
TEST(VP9Idct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[1024], coeff[1024];
- int16_t out_c[1024];
+ uint8_t dst[1024], src[1024];
double out_r[1024];
+ for (int j = 0; j < 1024; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
- in[j] = rnd.Rand8() - rnd.Rand8();
+ in[j] = src[j] - dst[j];
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < 1024; j++)
coeff[j] = round(out_r[j]);
- vp9_short_idct32x32_c(coeff, out_c, 64);
+ vp9_short_idct32x32_add_c(coeff, dst, 32);
for (int j = 0; j < 1024; ++j) {
- const int diff = out_c[j] - in[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
- << "Error: 3x32 IDCT has error " << error
+ << "Error: 32x32 IDCT has error " << error
<< " at index " << j;
}
}
@@ -126,18 +129,22 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[1024];
int16_t test_temp_block[1024];
- int16_t test_output_block[1024];
+ uint8_t dst[1024], src[1024];
+ for (int j = 0; j < 1024; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 64;
vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
for (int j = 0; j < 1024; ++j) {
- const unsigned diff = test_input_block[j] - test_output_block[j];
+ const unsigned diff = dst[j] - src[j];
const unsigned error = diff * diff;
if (max_error < error)
max_error = error;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ab9e28d..07607d8 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -70,17 +70,17 @@
} INTERPOLATIONFILTERTYPE;
typedef enum {
- DC_PRED, /* average of above and left pixels */
- V_PRED, /* vertical prediction */
- H_PRED, /* horizontal prediction */
- D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */
- D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
- D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
- D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
- D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */
- D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */
- TM_PRED, /* Truemotion prediction */
- I4X4_PRED, /* 4x4 based prediction, each 4x4 has its own mode */
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
+ D135_PRED, // Directional 135 deg = 180 - 45
+ D117_PRED, // Directional 117 deg = 180 - 63
+ D153_PRED, // Directional 153 deg = 180 - 27
+ D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi)
+ D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
+ TM_PRED, // True-motion
+ I4X4_PRED, // Each 4x4 subblock has its own mode
NEARESTMV,
NEARMV,
ZEROMV,
@@ -222,12 +222,21 @@
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
int a = b_width_log2(sb_type) - 1;
+#if CONFIG_AB4X4
+ // align 4x4 block to mode_info
+ if (a < 0)
+ a = 0;
+#endif
assert(a >= 0);
return a;
}
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
int a = b_height_log2(sb_type) - 1;
+#if CONFIG_AB4X4
+ if (a < 0)
+ a = 0;
+#endif
assert(a >= 0);
return a;
}
@@ -401,10 +410,39 @@
int sb_index; // index of 32x32 block inside the 64x64 block
int mb_index; // index of 16x16 block inside the 32x32 block
int b_index; // index of 8x8 block inside the 16x16 block
+#if CONFIG_AB4X4
+ int ab_index; // index of 4x4 block inside the 8x8 block
+#endif
int q_index;
} MACROBLOCKD;
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+ switch (subsize) {  // select the xd index field that belongs to subsize
+ case BLOCK_SIZE_SB64X32:
+ case BLOCK_SIZE_SB32X64:
+ case BLOCK_SIZE_SB32X32:
+ return &xd->sb_index;  // index of 32x32 block inside the 64x64 block
+ case BLOCK_SIZE_SB32X16:
+ case BLOCK_SIZE_SB16X32:
+ case BLOCK_SIZE_MB16X16:
+ return &xd->mb_index;  // index of 16x16 block inside the 32x32 block
+ case BLOCK_SIZE_SB16X8:
+ case BLOCK_SIZE_SB8X16:
+ case BLOCK_SIZE_SB8X8:
+ return &xd->b_index;  // index of 8x8 block inside the 16x16 block
+#if CONFIG_AB4X4
+ case BLOCK_SIZE_SB8X4:
+ case BLOCK_SIZE_SB4X8:
+ case BLOCK_SIZE_AB4X4:
+ return &xd->ab_index;  // index of 4x4 block inside the 8x8 block
+#endif
+ default:
+ assert(0);  // any other size (e.g. BLOCK_SIZE_SB64X64) has no index field
+ return NULL;  // only reached when assert() is compiled out
+ }
+}
+
static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type,
BLOCK_SIZE_TYPE sb_size) {
@@ -413,9 +451,12 @@
int bhl = mi_height_log2(sb_type);
int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
int i;
- // skip macroblock partition
+
+#if !CONFIG_AB4X4
+ // skip 8x8 block partition
if (bsl == 0)
return;
+#endif
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
@@ -463,7 +504,11 @@
above = (above > 0);
left = (left > 0);
+#if CONFIG_AB4X4
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+#else
return (left * 2 + above) + (bsl - 1) * PARTITION_PLOFFSET;
+#endif
}
static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
@@ -480,6 +525,10 @@
subsize = BLOCK_SIZE_SB32X16;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB16X8;
+#if CONFIG_AB4X4
+ else if (bsize == BLOCK_SIZE_SB8X8)
+ subsize = BLOCK_SIZE_SB8X4;
+#endif
else
assert(0);
break;
@@ -490,6 +539,10 @@
subsize = BLOCK_SIZE_SB16X32;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB8X16;
+#if CONFIG_AB4X4
+ else if (bsize == BLOCK_SIZE_SB8X8)
+ subsize = BLOCK_SIZE_SB4X8;
+#endif
else
assert(0);
break;
@@ -500,6 +553,10 @@
subsize = BLOCK_SIZE_MB16X16;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB8X8;
+#if CONFIG_AB4X4
+ else if (bsize == BLOCK_SIZE_SB8X8)
+ subsize = BLOCK_SIZE_AB4X4;
+#endif
else
assert(0);
break;
@@ -543,6 +600,7 @@
case B_V_PRED :
case B_D117_PRED :
+ case B_D63_PRED:
return ADST_DCT;
case B_H_PRED :
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index dbfb9ed..b6252d9 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -52,6 +52,10 @@
return value < low ? low : (value > high ? high : value);
}
+static INLINE double fclamp(double value, double low, double high) {
+ return value < low ? low : (value > high ? high : value);  // low tested first
+}
+
static INLINE int multiple16(int value) {
return (value + 15) & ~15;
}
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index aef6871..532e5d3 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -46,6 +46,13 @@
5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5
};
+
+DECLARE_ALIGNED(16, const uint8_t,
+ vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {  // 22 entries, 0..21
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 5  // last entry sits at index MAXBAND_INDEX
+};
+
DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
0, 1, 2, 3,
1, 2, 3, 4,
@@ -53,6 +60,12 @@
3, 4, 5, 5
};
+DECLARE_ALIGNED(16, const uint8_t,
+ vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {  // same length as 8x8plus
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 5  // entries 16..21 are padding: a 4x4 block has 16 coefs
+};
+
DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 579313f..9352bf6 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -133,20 +133,20 @@
extern const int vp9_coef_bands8x8[64];
extern const int vp9_coef_bands4x4[16];
+extern const uint8_t vp9_coefband_trans_8x8plus[22];
+extern const uint8_t vp9_coefband_trans_4x4[22];
-static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {
- if (tx_size == TX_4X4) {
- return vp9_coef_bands4x4[scan[coef_index]];
- } else {
- const int pos = scan[coef_index];
- const int sz = 1 << (2 + tx_size);
- const int x = pos & (sz - 1), y = pos >> (2 + tx_size);
- if (x >= 8 || y >= 8)
- return 5;
- else
- return vp9_coef_bands8x8[y * 8 + x];
- }
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// For 4x4 blocks the index is less but to keep things common the lookup
+// table for 4x4 is padded out to this index.
+#define MAXBAND_INDEX 21
+
+static int get_coef_band(const uint8_t * band_translate, int coef_index) {
+ return (coef_index > MAXBAND_INDEX)  // past the last table entry?
+ ? (COEF_BANDS-1) : band_translate[coef_index];  // top band, else look up
+}
+
extern int vp9_get_coef_context(const int *scan, const int *neighbors,
int nb_pad, uint8_t *token_cache, int c, int l);
const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index dcee62f..577aab5 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -106,6 +106,12 @@
const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
// FIXME(jingning,rbultje) put real probabilities here
+#if CONFIG_AB4X4
+ {202, 162, 107},
+ {16, 2, 169},
+ {3, 246, 19},
+ {104, 90, 134},
+#endif
{202, 162, 107},
{16, 2, 169},
{3, 246, 19},
@@ -513,6 +519,7 @@
vp9_sub_mv_ref_tree, fc->sub_mv_ref_counts[i],
fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
LEFT4X4);
+
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
fc->partition_counts[i], fc->pre_partition_prob[i],
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 1663195..626f0b9 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -36,6 +36,7 @@
BLOCK_SIZE_SB32X64,
BLOCK_SIZE_SB64X32,
BLOCK_SIZE_SB64X64,
+ BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
typedef enum PARTITION_TYPE {
@@ -47,6 +48,10 @@
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#if CONFIG_AB4X4
+#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#else
#define NUM_PARTITION_CONTEXTS (3 * PARTITION_PLOFFSET)
+#endif
#endif // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 3ec093f..b166fcb 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -621,10 +621,9 @@
output[15] = step2[0] - step2[15];
}
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[16], temp_out[16];
@@ -641,7 +640,8 @@
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
@@ -823,8 +823,8 @@
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
- int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+ int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
@@ -844,38 +844,38 @@
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]); }
+}
+
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t out[16 * 16];
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[16], temp_out[16];
+
+ /* First transform rows. Since all non-zero dct coefficients are in
+ * upper-left 4x4 area, we only need to calculate first 4 rows here.
+ */
+ vpx_memset(out, 0, sizeof(out));  // rows 4..15 are never written below
+ for (i = 0; i < 4; ++i) {
+ idct16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns; each rounded result is added to dest and clipped.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ idct16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
- int16_t out[16 * 16];
- int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
- int i, j;
- int16_t temp_in[16], temp_out[16];
-
- /* First transform rows. Since all non-zero dct coefficients are in
- * upper-left 4x4 area, we only need to calculate first 4 rows here.
- */
- vpx_memset(out, 0, sizeof(out));
- for (i = 0; i < 4; ++i) {
- idct16_1d(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
- idct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
- }
-}
-
-
void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
@@ -1249,10 +1249,9 @@
output[31] = step1[0] - step1[31];
}
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[32], temp_out[32];
@@ -1269,7 +1268,8 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
@@ -1279,10 +1279,10 @@
output[0] = ROUND_POWER_OF_TWO(out, 6);
}
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[32], temp_out[32];
@@ -1302,6 +1302,7 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 80fccd5..589984f 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -49,9 +49,6 @@
void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
int ystride, int uv_stride, struct loop_filter_info *lfi)
-#define prototype_simple_loopfilter(sym) \
- void sym(uint8_t *y, int ystride, const unsigned char *blimit)
-
#if ARCH_X86 || ARCH_X86_64
#include "x86/vp9_loopfilter_x86.h"
#endif
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 15785f5..fc7fbc4 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -8,15 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdlib.h>
#include "vpx_config.h"
+#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
static INLINE int8_t signed_char_clamp(int t) {
- t = (t < -128 ? -128 : t);
- t = (t > 127 ? 127 : t);
- return (int8_t) t;
+ return (int8_t)clamp(t, -128, 127);
}
// should we apply any filter at all: 11111111 yes, 00000000 no
@@ -36,7 +34,7 @@
return ~mask;
}
-// is there high variance internal edge: 11111111 yes, 00000000 no
+// is there high edge variance on the internal edge: 11111111 yes, 00000000 no
static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;
@@ -68,12 +66,9 @@
*oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
*op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
- filter = filter1;
// outer tap adjustments
- filter += 1;
- filter >>= 1;
- filter &= ~hev;
+ filter = ((filter1 + 1) >> 1) & ~hev;
*oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
*op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
@@ -84,23 +79,19 @@
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
- // high edge variance
- const int8_t hev = hevmask(thresh[0],
- s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
++s;
- } while (++i < count * 8);
+ }
}
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -108,21 +99,21 @@
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4], s[-3], s[-2], s[-1],
- s[0], s[1], s[2], s[3]);
-
- // high edge variance
- const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
filter(mask, hev, s - 2, s - 1, s, s + 1);
s += pitch;
- } while (++i < count * 8);
+ }
}
+
static INLINE int8_t flatmask4(uint8_t thresh,
uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
@@ -157,14 +148,8 @@
uint8_t *oq2, uint8_t *oq3) {
// use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
if (flat && mask) {
- const uint8_t p3 = *op3;
- const uint8_t p2 = *op2;
- const uint8_t p1 = *op1;
- const uint8_t p0 = *op0;
- const uint8_t q0 = *oq0;
- const uint8_t q1 = *oq1;
- const uint8_t q2 = *oq2;
- const uint8_t q3 = *oq3;
+ const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
*op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
*op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
@@ -173,33 +158,7 @@
*oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
*oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
} else {
- int8_t filter1, filter2;
-
- const int8_t ps1 = (int8_t) *op1 ^ 0x80;
- const int8_t ps0 = (int8_t) *op0 ^ 0x80;
- const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
- const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-
- // add outer taps if we have high edge variance
- int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
- // inner taps
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
- filter1 = signed_char_clamp(filter + 4) >> 3;
- filter2 = signed_char_clamp(filter + 3) >> 3;
-
- *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
- *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
- filter = filter1;
-
- // outer tap adjustments
- filter += 1;
- filter >>= 1;
- filter &= ~hev;
-
- *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
- *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+ filter(mask, hev, op1, op0, oq0, oq1);
}
}
@@ -208,28 +167,23 @@
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t hev = hevmask(thresh[0],
- s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
- const int8_t flat = flatmask4(1,
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
mbfilter(mask, hev, flat,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p);
-
++s;
- } while (++i < count * 8);
-
+ }
}
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -237,72 +191,19 @@
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4], s[-3], s[-2], s[-1],
- s[0], s[1], s[2], s[3]);
-
- const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
- const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
- s[ 0], s[ 1], s[ 2], s[ 3]);
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
+ const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3);
s += pitch;
- } while (++i < count * 8);
-
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t simple_filter_mask(uint8_t blimit,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1) {
- return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
-}
-
-static INLINE void simple_filter(int8_t mask,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1) {
- int8_t filter1, filter2;
- const int8_t p1 = (int8_t) *op1 ^ 0x80;
- const int8_t p0 = (int8_t) *op0 ^ 0x80;
- const int8_t q0 = (int8_t) *oq0 ^ 0x80;
- const int8_t q1 = (int8_t) *oq1 ^ 0x80;
-
- int8_t filter = signed_char_clamp(p1 - q1);
- filter = signed_char_clamp(filter + 3 * (q0 - p0));
- filter &= mask;
-
- // save bottom 3 bits so that we round one side +4 and the other +3
- filter1 = signed_char_clamp(filter + 4) >> 3;
- *oq0 = signed_char_clamp(q0 - filter1) ^ 0x80;
-
- filter2 = signed_char_clamp(filter + 3) >> 3;
- *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
- const uint8_t *blimit) {
- int i = 0;
-
- do {
- const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p],
- s[0 * p], s[1 * p]);
- simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
- ++s;
- } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
- const uint8_t *blimit) {
- int i = 0;
-
- do {
- const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
- simple_filter(mask, s - 2, s - 1, s, s + 1);
- s += p;
- } while (++i < 16);
+ }
}
/* Vertical MB Filtering */
@@ -392,11 +293,6 @@
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
- vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit);
-}
void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
int y_stride, int uv_stride,
@@ -413,12 +309,6 @@
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
- vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit);
-}
-
static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6, uint8_t *op5,
@@ -429,22 +319,11 @@
uint8_t *oq7) {
// use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
if (flat2 && flat && mask) {
- const uint8_t p7 = *op7;
- const uint8_t p6 = *op6;
- const uint8_t p5 = *op5;
- const uint8_t p4 = *op4;
- const uint8_t p3 = *op3;
- const uint8_t p2 = *op2;
- const uint8_t p1 = *op1;
- const uint8_t p0 = *op0;
- const uint8_t q0 = *oq0;
- const uint8_t q1 = *oq1;
- const uint8_t q2 = *oq2;
- const uint8_t q3 = *oq3;
- const uint8_t q4 = *oq4;
- const uint8_t q5 = *oq5;
- const uint8_t q6 = *oq6;
- const uint8_t q7 = *oq7;
+ const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+ p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+ q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
*op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
q0, 4);
@@ -474,49 +353,8 @@
q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
*oq6 = ROUND_POWER_OF_TWO(p0 +
q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
- } else if (flat && mask) {
- const uint8_t p3 = *op3;
- const uint8_t p2 = *op2;
- const uint8_t p1 = *op1;
- const uint8_t p0 = *op0;
- const uint8_t q0 = *oq0;
- const uint8_t q1 = *oq1;
- const uint8_t q2 = *oq2;
- const uint8_t q3 = *oq3;
-
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
} else {
- int8_t filter1, filter2;
-
- const int8_t ps1 = (int8_t) * op1 ^ 0x80;
- const int8_t ps0 = (int8_t) * op0 ^ 0x80;
- const int8_t qs0 = (int8_t) * oq0 ^ 0x80;
- const int8_t qs1 = (int8_t) * oq1 ^ 0x80;
-
- // add outer taps if we have high edge variance
- int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
- // inner taps
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
- filter1 = signed_char_clamp(filter + 4) >> 3;
- filter2 = signed_char_clamp(filter + 3) >> 3;
-
- *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
- *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
- filter = filter1;
-
- // outer tap adjustments
- filter += 1;
- filter >>= 1;
- filter &= ~hev;
-
- *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
- *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+ mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
}
}
@@ -525,25 +363,20 @@
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
- const int8_t hev = hevmask(thresh[0],
- s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
- const int8_t flat = flatmask4(1,
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flatmask5(1,
- s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
- s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
+ s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+ q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
wide_mbfilter(mask, hev, flat, flat2,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
@@ -552,33 +385,31 @@
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
++s;
- } while (++i < count * 8);
+ }
}
+
void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh,
int count) {
- int i = 0;
+ int i;
- do {
- const int8_t mask = filter_mask(limit[0], blimit[0],
- s[-4], s[-3], s[-2], s[-1],
- s[0], s[1], s[2], s[3]);
-
- const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
- const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
- s[ 0], s[ 1], s[ 2], s[ 3]);
- const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], s[-1],
- s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7]);
wide_mbfilter(mask, hev, flat, flat2,
- s - 8, s - 7, s - 6, s - 5,
- s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3,
- s + 4, s + 5, s + 6, s + 7);
+ s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
s += p;
- } while (++i < count * 8);
+ }
}
void vp9_lpf_mbv_w_c(uint8_t *y, uint8_t *u, uint8_t *v,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 57f9978..2d4cd30 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -306,6 +306,13 @@
return 2 * ((cm->mb_cols + 3) & ~3);
}
+static INLINE void set_partition_seg_context(VP9_COMMON *cm,
+ MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ xd->above_seg_context = cm->above_seg_context + mi_col;
+ xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+}
+
static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int bh,
int mi_col, int bw) {
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index f81690a..8001adb 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -132,14 +132,15 @@
/****************************************************************************
*/
-void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
+void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit) {
- uint8_t *p_src, *p_dst;
+ uint8_t const *p_src;
+ uint8_t *p_dst;
int row;
int col;
int i;
@@ -313,51 +314,52 @@
source->uv_height, source->uv_width, ppl);
}
-void vp9_deblock(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) low_var_thresh;
- (void) flag;
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ int q) {
+ const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+ + 0.0065 + 0.5);
+ int i;
- vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
- source->y_stride, post->y_stride,
- source->y_height, source->y_width, ppl);
+ const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+ const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+ const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+ const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
- vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
+ uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+ const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
- vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ vp9_post_proc_down_and_across(srcs[i], dsts[i],
+ src_strides[i], dst_strides[i],
+ src_heights[i], src_widths[i], ppl);
}
-void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post,
- int q, int low_var_thresh, int flag) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) post;
- (void) low_var_thresh;
- (void) flag;
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ int q) {
+ const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+ + 0.0065 + 0.5);
+ int i;
- vp9_post_proc_down_and_across(src->y_buffer + 2 * src->y_stride + 2,
- src->y_buffer + 2 * src->y_stride + 2,
- src->y_stride, src->y_stride, src->y_height - 4,
- src->y_width - 4, ppl);
+ const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+ const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+ const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+ const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
- vp9_post_proc_down_and_across(src->u_buffer + 2 * src->uv_stride + 2,
- src->u_buffer + 2 * src->uv_stride + 2,
- src->uv_stride, src->uv_stride,
- src->uv_height - 4, src->uv_width - 4, ppl);
+ uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+ const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
- vp9_post_proc_down_and_across(src->v_buffer + 2 * src->uv_stride + 2,
- src->v_buffer + 2 * src->uv_stride + 2,
- src->uv_stride, src->uv_stride,
- src->uv_height - 4, src->uv_width - 4, ppl);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int src_stride = src_strides[i];
+ const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+ const int src_width = src_widths[i] - 4;
+ const int src_height = src_heights[i] - 4;
+
+ const int dst_stride = dst_strides[i];
+ uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+ vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ src_height, src_width, ppl);
+ }
}
double vp9_gaussian(double sigma, double mu, double x) {
@@ -642,7 +644,7 @@
deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0);
+ vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
} else {
vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
}
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index c2f556e..2c0d333 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -29,10 +29,8 @@
int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
vp9_ppflags_t *flags);
-void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
- int q, int low_var_thresh, int flag);
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
-void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
- int q, int low_var_thresh, int flag);
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
#endif // VP9_COMMON_VP9_POSTPROC_H_
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 3668fcd..e7303f1 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -17,6 +17,78 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+static int scale_value_x_with_scaling(int val,
+ const struct scale_factors *scale) {
+ return val * scale->x_num / scale->x_den;
+}
+
+static int scale_value_y_with_scaling(int val,
+ const struct scale_factors *scale) {
+ return val * scale->y_num / scale->y_den;
+}
+
+static int unscaled_value(int val, const struct scale_factors *scale) {
+ (void) scale;
+ return val;
+}
+
+static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
+ const struct scale_factors *scale) {
+ // returns mv * scale + offset
+ int_mv32 result;
+ const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
+ const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
+
+ /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+ result.as_mv.row = mv_row_q4 * scale->y_num / scale->y_den
+ + scale->y_offset_q4;
+ result.as_mv.col = mv_col_q4 * scale->x_num / scale->x_den
+ + scale->x_offset_q4;
+ return result;
+}
+
+static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
+ const struct scale_factors *scale) {
+ // returns mv shifted left by one bit (q3 -> q4); no scale or offset applied
+ int_mv32 result;
+
+ result.as_mv.row = src_mv->as_mv.row << 1;
+ result.as_mv.col = src_mv->as_mv.col << 1;
+ return result;
+}
+
+static int32_t mv_component_q4_with_scaling(int mv_q4, int num, int den,
+ int offset_q4) {
+ // returns the scaled and offset value of the mv component.
+
+ /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+ return mv_q4 * num / den + offset_q4;
+}
+
+static int32_t mv_component_q4_without_scaling(int mv_q4, int num, int den,
+ int offset_q4) {
+ // returns the mv component unchanged; num, den and offset_q4 are ignored.
+ (void)num;
+ (void)den;
+ (void)offset_q4;
+ return mv_q4;
+}
+
+static void set_offsets_with_scaling(struct scale_factors *scale,
+ int row, int col) {
+ const int x_q4 = 16 * col;
+ const int y_q4 = 16 * row;
+
+ scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
+ scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
+}
+
+static void set_offsets_without_scaling(struct scale_factors *scale,
+ int row, int col) {
+ scale->x_offset_q4 = 0;
+ scale->y_offset_q4 = 0;
+}
+
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
int other_w, int other_h,
int this_w, int this_h) {
@@ -34,18 +106,14 @@
scale->scale_value_x = unscaled_value;
scale->scale_value_y = unscaled_value;
scale->set_scaled_offsets = set_offsets_without_scaling;
- scale->scale_motion_vector_q3_to_q4 =
- motion_vector_q3_to_q4_without_scaling;
- scale->scale_motion_vector_component_q4 =
- motion_vector_component_q4_without_scaling;
+ scale->scale_motion_vector_q3_to_q4 = mv_q3_to_q4_without_scaling;
+ scale->scale_motion_vector_component_q4 = mv_component_q4_without_scaling;
} else {
scale->scale_value_x = scale_value_x_with_scaling;
scale->scale_value_y = scale_value_y_with_scaling;
scale->set_scaled_offsets = set_offsets_with_scaling;
- scale->scale_motion_vector_q3_to_q4 =
- motion_vector_q3_to_q4_with_scaling;
- scale->scale_motion_vector_component_q4 =
- motion_vector_component_q4_with_scaling;
+ scale->scale_motion_vector_q3_to_q4 = mv_q3_to_q4_with_scaling;
+ scale->scale_motion_vector_component_q4 = mv_component_q4_with_scaling;
}
// TODO(agrange): Investigate the best choice of functions to use here
@@ -424,3 +492,18 @@
vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
BLOCK_SIZE_MB16X16);
}
+
+// TODO(dkovalev): find a better place for this function
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
+ const int ref = cm->active_ref_idx[i];
+ struct scale_factors *const sf = &cm->active_ref_scale[i];
+ if (ref >= NUM_YV12_BUFFERS) {
+ memset(sf, 0, sizeof(*sf));
+ } else {
+ YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
+ vp9_setup_scale_factors_for_frame(sf,
+ fb->y_crop_width, fb->y_crop_height,
+ cm->width, cm->height);
+ }
+}
+
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index faf018c..8f76195 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -52,21 +52,6 @@
int w, int h, int do_avg,
const struct subpix_fn_table *subpix);
-static int scale_value_x_with_scaling(int val,
- const struct scale_factors *scale) {
- return val * scale->x_num / scale->x_den;
-}
-
-static int scale_value_y_with_scaling(int val,
- const struct scale_factors *scale) {
- return val * scale->y_num / scale->y_den;
-}
-
-static int unscaled_value(int val, const struct scale_factors *scale) {
- (void) scale;
- return val;
-}
-
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
@@ -137,66 +122,6 @@
xd->scale_factor_uv[1] = xd->scale_factor[1];
}
-static void set_offsets_with_scaling(struct scale_factors *scale,
- int row, int col) {
- const int x_q4 = 16 * col;
- const int y_q4 = 16 * row;
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
- scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
- scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
- int row, int col) {
- scale->x_offset_q4 = 0;
- scale->y_offset_q4 = 0;
-}
-
-static int_mv32 motion_vector_q3_to_q4_with_scaling(
- const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
- const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
- const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
-
- /* TODO(jkoleszar): make fixed point, or as a second multiply? */
- result.as_mv.row = mv_row_q4 * scale->y_num / scale->y_den
- + scale->y_offset_q4;
- result.as_mv.col = mv_col_q4 * scale->x_num / scale->x_den
- + scale->x_offset_q4;
- return result;
-}
-
-static int_mv32 motion_vector_q3_to_q4_without_scaling(
- const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
-
- result.as_mv.row = src_mv->as_mv.row << 1;
- result.as_mv.col = src_mv->as_mv.col << 1;
- return result;
-}
-
-static int32_t motion_vector_component_q4_with_scaling(int mv_q4,
- int num,
- int den,
- int offset_q4) {
- // returns the scaled and offset value of the mv component.
-
- /* TODO(jkoleszar): make fixed point, or as a second multiply? */
- return mv_q4 * num / den + offset_q4;
-}
-
-static int32_t motion_vector_component_q4_without_scaling(int mv_q4,
- int num,
- int den,
- int offset_q4) {
- // returns the scaled and offset value of the mv component.
- (void)num;
- (void)den;
- (void)offset_q4;
- return mv_q4;
-}
#endif // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 75e3604..48ce7db 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -91,12 +91,6 @@
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_8x8 sse2
-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
@@ -128,30 +122,6 @@
prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh8x8 sse2
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbh_w sse2
@@ -170,7 +140,7 @@
specialize vp9_mbpost_proc_across_ip sse2
vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
-prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
+prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
specialize vp9_post_proc_down_and_across mmx sse2
vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
@@ -227,24 +197,23 @@
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
-
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht8x8
@@ -252,8 +221,8 @@
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
specialize vp9_idct4_1d sse2
@@ -337,41 +306,74 @@
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index dd7e68a..667da33 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -752,8 +752,17 @@
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ in_x = _mm_add_epi16(in_x, d0); \
+ in_x = _mm_packus_epi16(in_x, in_x); \
+ _mm_storel_epi64((__m128i *)(dest), in_x); \
+ dest += stride; \
+ }
+
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -938,31 +947,30 @@
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
- output += 8;
+ dest += 8 - (stride * 16);
}
}
}
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1015,6 @@
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
-
// 1-D idct. Load input data.
in0 = _mm_load_si128((__m128i *)input);
in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,29 +1305,28 @@
in14 = _mm_srai_epi16(in14, 6);
in15 = _mm_srai_epi16(in15, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
- output += 8;
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+
+ dest += 8 - (stride * 16);
}
}
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -1832,6 +1838,8 @@
col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
} else {
+ const __m128i zero = _mm_setzero_si128();
+
// 2_D: Calculate the results and store them to destination.
in0 = _mm_add_epi16(stp1_0, stp1_31);
in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1941,40 @@
in30 = _mm_srai_epi16(in30, 6);
in31 = _mm_srai_epi16(in31, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
- _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
- _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
- _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
- _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
- _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
- _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
- _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
- _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
- _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
- _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
- _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
- _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
- _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
- _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
- _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
- _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
- output += 8;
+ dest += 8 - (stride * 32);
}
}
}
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
index 2be9e31..7e6c4be 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@
}
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
/* Vertical B Filtering */
void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 08447a6..7982ca6 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@
v_ptr + 4 * uv_stride);
}
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
/* Vertical B Filtering */
void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@
v_ptr + 4);
}
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index ceffdf5..4ebb51b 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@
pop rbp
ret
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rcx, 2 ; count
-.nexts8_h:
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm3, [rdx] ;
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movq mm1, [rsi+2*rax] ; p1
- movq mm0, [rdi] ; q1
- movq mm2, mm1
- movq mm7, mm0
- movq mm4, mm0
- psubusb mm0, mm1 ; q1-=p1
- psubusb mm1, mm4 ; p1-=q1
- por mm1, mm0 ; abs(p1-q1)
- pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm1, 1 ; abs(p1-q1)/2
-
- movq mm5, [rsi+rax] ; p0
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- movq mm6, mm5 ; p0
- psubusb mm5, mm4 ; p0-=q0
- psubusb mm4, mm6 ; q0-=p0
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm3, mm3
- pcmpeqb mm5, mm3
-
- ; start work on filters
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
- pand mm5, mm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- movq mm1, mm5 ; get a copy of filters
- psraw mm1, 11 ; arithmetic shift right 11
- psllw mm1, 8 ; shift left 8 to put it back
-
- por mm0, mm1 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .nexts8_h
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4- 2]; ;
- mov rcx, 2 ; count
-.nexts8_v:
-
- lea rdi, [rsi + rax];
- movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
-
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
-
- movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
- movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
-
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
- movq mm5, mm4 ; 53 43 52 42 51 41 50 40
-
- punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
- punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
-
- neg rax
-
- movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
-
- punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
- movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
-
- movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
- punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
-
- movq mm2, mm0 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 13 03 12 02 11 01 10 00
-
- punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
- movq mm3, mm2 ; 33 23 13 03 32 22 12 02
-
- punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
- punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
-
- punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
-
-
- ; calculate mask
- movq mm6, mm0 ; p1
- movq mm7, mm3 ; q1
- psubusb mm7, mm6 ; q1-=p1
- psubusb mm6, mm3 ; p1-=q1
- por mm6, mm7 ; abs(p1-q1)
- pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm6, 1 ; abs(p1-q1)/2
-
- movq mm5, mm1 ; p0
- movq mm4, mm2 ; q0
-
- psubusb mm5, mm2 ; p0-=q0
- psubusb mm4, mm1 ; q0-=p0
-
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx]
-
- psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm7, mm7
- pcmpeqb mm5, mm7 ; mm5 = mask
-
- ; start work on filters
- movq t0, mm0
- movq t1, mm3
-
- pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm0, mm3 ; p1 - q1
- movq mm6, mm1 ; p0
-
- movq mm7, mm2 ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm7 ; offseted ; q0
-
- psubsb mm7, mm6 ; q0 - p0
- paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand mm5, mm0 ; mask filter values we don't care about
-
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- movq mm7, mm5 ; get a copy of filters
- psraw mm7, 11 ; arithmetic shift right 11
- psllw mm7, 8 ; shift left 8 to put it back
-
- por mm0, mm7 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0sz add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
-
- movq mm0, t0
- movq mm4, t1
-
- ; mm0 = 70 60 50 40 30 20 10 00
- ; mm6 = 71 61 51 41 31 21 11 01
- ; mm3 = 72 62 52 42 32 22 12 02
- ; mm4 = 73 63 53 43 33 23 13 03
- ; transpose back to write out
-
- movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
-
- punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
- movq mm2, mm3 ;
-
- punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
- movq mm5, mm1 ; 71 70 61 60 51 50 41 40
-
- punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
- movq mm6, mm0 ; 31 30 21 20 11 10 01 00
-
- punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
- punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
-
- movd [rsi+rax*4], mm0 ; write 03 02 01 00
- punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
-
- psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
- punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
-
- movd [rdi+rax*4], mm0 ; write 13 12 11 10
- movd [rsi+rax*2], mm6 ; write 23 22 21 20
-
- psrlq mm6, 32 ; 33 32 31 30
- movd [rsi], mm1 ; write 43 42 41 40
-
- movd [rsi + rax], mm6 ; write 33 32 31 30
- neg rax
-
- movd [rsi + rax*2], mm5 ; write 63 62 61 60
- psrlq mm1, 32 ; 53 52 51 50
-
- movd [rdi], mm1 ; write out 53 52 51 50
- psrlq mm5, 32 ; 73 72 71 70
-
- movd [rdi + rax*2], mm5 ; write 73 72 71 70
-
- lea rsi, [rsi+rax*8] ; next 8
-
- dec rcx
- jnz .nexts8_v
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-; int y_stride,
-; loop_filter_info *lfi)
-;{
-;
-;
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
SECTION_RODATA
align 16
tfe:
diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm
index ae4c60f..74236cf 100644
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@
pop rbp
ret
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- mov rdx, arg(2) ;blimit
- movdqa xmm3, XMMWORD PTR [rdx]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movdqa xmm1, [rsi+2*rax] ; p1
- movdqa xmm0, [rdi] ; q1
- movdqa xmm2, xmm1
- movdqa xmm7, xmm0
- movdqa xmm4, xmm0
- psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm4 ; p1-=q1
- por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm1, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, [rsi+rax] ; p0
- movdqa xmm4, [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
- movdqa xmm6, xmm5 ; p0
- psubusb xmm5, xmm4 ; p0-=q0
- psubusb xmm4, xmm6 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm3, xmm3
- pcmpeqb xmm5, xmm3
-
- ; start work on filters
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
-
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
- pand xmm5, xmm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- movdqa xmm1, xmm5 ; get a copy of filters
- psraw xmm1, 11 ; arithmetic shift right 11
- psllw xmm1, 8 ; shift left 8 to put it back
-
- por xmm0, xmm1 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqa [rsi], xmm3 ; write back
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqa [rsi+rax], xmm6 ; write back
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
- push rbp ; save old base pointer value.
- mov rbp, rsp ; set new base pointer value.
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx ; save callee-saved reg
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi - 2 ]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movd xmm2, [rdi] ; 13 12 11 10
- movd xmm3, [rcx] ; 53 52 51 50
- punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
- punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
-
- movd xmm4, [rsi + rax*2] ; 23 22 21 20
- movd xmm5, [rdx + rax*2] ; 63 62 61 60
- movd xmm6, [rdi + rax*2] ; 33 32 31 30
- movd xmm7, [rcx + rax*2] ; 73 72 71 70
- punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
- punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
-
- punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- movdqa xmm2, xmm0
- punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- movdqa t0, xmm0 ; save to t0
- movdqa t1, xmm2 ; save to t1
-
- lea rsi, [rsi + rax*8]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm4, [rsi] ; 83 82 81 80
- movd xmm1, [rdx] ; c3 c2 c1 c0
- movd xmm6, [rdi] ; 93 92 91 90
- movd xmm3, [rcx] ; d3 d2 d1 d0
- punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
- punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
-
- movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
- movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
- movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
- punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
- punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
-
- punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
- punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
- movdqa xmm1, xmm4
- punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- movdqa xmm6, xmm4
- punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
- punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
- movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
-
- punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- ; calculate mask
- movdqa xmm6, xmm0 ; p1
- movdqa xmm7, xmm3 ; q1
- psubusb xmm7, xmm0 ; q1-=p1
- psubusb xmm6, xmm3 ; p1-=q1
- por xmm6, xmm7 ; abs(p1-q1)
- pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm6, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, xmm1 ; p0
- movdqa xmm4, xmm2 ; q0
- psubusb xmm5, xmm2 ; p0-=q0
- psubusb xmm4, xmm1 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm7, xmm7
- pcmpeqb xmm5, xmm7 ; mm5 = mask
-
- ; start work on filters
- movdqa t0, xmm0
- movdqa t1, xmm3
-
- pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb xmm0, xmm3 ; p1 - q1
- movdqa xmm6, xmm1 ; p0
-
- movdqa xmm7, xmm2 ; q0
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm7 ; offseted ; q0
-
- psubsb xmm7, xmm6 ; q0 - p0
- paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand xmm5, xmm0 ; mask filter values we don't care about
-
-
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
-
- movdqa xmm7, xmm5 ; get a copy of filters
- psraw xmm7, 11 ; arithmetic shift right 11
-
- psllw xmm7, 8 ; shift left 8 to put it back
- por xmm0, xmm7 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0sz add
- pxor xmm3, [GLOBAL(t80)] ; unoffset q0
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
- movdqa xmm0, xmm5 ; get a copy of filters
-
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
-
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
-
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset p0
-
- movdqa xmm0, t0 ; p1
- movdqa xmm4, t1 ; q1
-
- ; transpose back to write out
- ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm5, xmm3
- punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- ; write out order: xmm0 xmm2 xmm1 xmm3
- lea rdx, [rsi + rax*4]
-
- movd [rsi], xmm1 ; write the second 8-line result
- psrldq xmm1, 4
- movd [rdi], xmm1
- psrldq xmm1, 4
- movd [rsi + rax*2], xmm1
- psrldq xmm1, 4
- movd [rdi + rax*2], xmm1
-
- movd [rdx], xmm3
- psrldq xmm3, 4
- movd [rcx], xmm3
- psrldq xmm3, 4
- movd [rdx + rax*2], xmm3
- psrldq xmm3, 4
- movd [rcx + rax*2], xmm3
-
- neg rax
- lea rsi, [rsi + rax*8]
- neg rax
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd [rsi], xmm0 ; write the first 8-line result
- psrldq xmm0, 4
- movd [rdi], xmm0
- psrldq xmm0, 4
- movd [rsi + rax*2], xmm0
- psrldq xmm0, 4
- movd [rdi + rax*2], xmm0
-
- movd [rdx], xmm2
- psrldq xmm2, 4
- movd [rcx], xmm2
- psrldq xmm2, 4
- movd [rdx + rax*2], xmm2
- psrldq xmm2, 4
- movd [rcx + rax*2], xmm2
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
SECTION_RODATA
align 16
tfe:
diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h
index 46a6202..fb5af05 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@
extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
#endif
#if HAVE_SSE2
@@ -34,10 +30,6 @@
extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
#endif
#endif // LOOPFILTER_X86_H
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index a1f780a..46d21b9 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -119,13 +119,25 @@
m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
// luma mode
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+ m->mbmi.mode = read_kf_sb_ymode(r,
+ cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]);
+ else
+ m->mbmi.mode = I4X4_PRED;
+#else
m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
+#endif
m->mbmi.ref_frame = INTRA_FRAME;
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
if (m->mbmi.mode == I4X4_PRED) {
+#endif
int i;
for (i = 0; i < 4; ++i) {
const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
@@ -139,7 +151,13 @@
m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
if (cm->txfm_mode == TX_MODE_SELECT &&
- !m->mbmi.mb_skip_coeff && m->mbmi.mode != I4X4_PRED) {
+ !m->mbmi.mb_skip_coeff &&
+#if CONFIG_AB4X4
+ m->mbmi.sb_type >= BLOCK_SIZE_SB8X8
+#else
+ m->mbmi.mode != I4X4_PRED
+#endif
+ ) {
const int allow_16x16 = m->mbmi.sb_type >= BLOCK_SIZE_MB16X16;
const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
@@ -150,7 +168,13 @@
m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 &&
m->mbmi.mode <= TM_PRED) {
m->mbmi.txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != I4X4_PRED) {
+ } else if (cm->txfm_mode >= ALLOW_8X8 &&
+#if CONFIG_AB4X4
+ m->mbmi.sb_type >= BLOCK_SIZE_SB8X8
+#else
+ m->mbmi.mode != I4X4_PRED
+#endif
+ ) {
m->mbmi.txfm_size = TX_8X8;
} else {
m->mbmi.txfm_size = TX_4X4;
@@ -618,9 +642,16 @@
if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
} else {
+#if CONFIG_AB4X4
+ if (mbmi->sb_type >= BLOCK_SIZE_SB8X8)
+ mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
+ else
+ mbmi->mode = SPLITMV;
+#else
mbmi->mode = mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
read_sb_mv_ref(r, mv_ref_p)
: read_mv_ref(r, mv_ref_p);
+#endif
vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
}
@@ -820,6 +851,14 @@
// required for left and above block mv
mv0->as_int = 0;
+#if CONFIG_AB4X4
+ if (mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+ mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
+ cm->fc.sb_ymode_counts[mbmi->mode]++;
+ } else {
+ mbmi->mode = I4X4_PRED;
+ }
+#else
if (mbmi->sb_type > BLOCK_SIZE_SB8X8) {
mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
cm->fc.sb_ymode_counts[mbmi->mode]++;
@@ -827,9 +866,14 @@
mbmi->mode = read_ymode(r, cm->fc.ymode_prob);
cm->fc.ymode_counts[mbmi->mode]++;
}
+#endif
// If MB mode is I4X4_PRED read the block modes
+#if CONFIG_AB4X4
+ if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+#else
if (mbmi->mode == I4X4_PRED) {
+#endif
int j = 0;
do {
int m = read_bmode(r, cm->fc.bmode_prob);
@@ -842,9 +886,14 @@
cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
}
+#if CONFIG_AB4X4
+ if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+ mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+#else
if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != I4X4_PRED) ||
(mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+#endif
const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
@@ -852,13 +901,21 @@
cm->txfm_mode >= ALLOW_32X32) {
mbmi->txfm_size = TX_32X32;
} else if (cm->txfm_mode >= ALLOW_16X16 &&
- mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
- ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
- (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+ mbmi->sb_type >= BLOCK_SIZE_MB16X16
+#if !CONFIG_AB4X4
+ && ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))
+#endif
+ ) {
mbmi->txfm_size = TX_16X16;
} else if (cm->txfm_mode >= ALLOW_8X8 &&
+#if CONFIG_AB4X4
+ (mbmi->sb_type >= BLOCK_SIZE_SB8X8))
+#else
(!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
- !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV))) {
+ !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV)))
+#endif
+ {
mbmi->txfm_size = TX_8X8;
} else {
mbmi->txfm_size = TX_4X4;
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 4be3677..2e233c3 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -417,10 +417,14 @@
vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
set_refs(pbi, mi_row, mi_col);
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8)
+#else
if (bsize == BLOCK_SIZE_SB8X8 &&
(xd->mode_info_context->mbmi.mode == SPLITMV ||
xd->mode_info_context->mbmi.mode == I4X4_PRED))
- decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
+#endif
+ decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
else
decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
@@ -439,7 +443,17 @@
if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
return;
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8)
+ if (xd->ab_index != 0)
+ return;
+#endif
+
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
if (bsize > BLOCK_SIZE_SB8X8) {
+#endif
int pl;
// read the partition information
xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
@@ -451,6 +465,7 @@
}
subsize = get_subsize(bsize, partition);
+
switch (partition) {
case PARTITION_NONE:
decode_modes_b(pbi, mi_row, mi_col, r, subsize);
@@ -468,12 +483,7 @@
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
int j = n >> 1, i = n & 0x01;
- if (subsize == BLOCK_SIZE_SB32X32)
- xd->sb_index = n;
- else if (subsize == BLOCK_SIZE_MB16X16)
- xd->mb_index = n;
- else
- xd->b_index = n;
+ *(get_sb_index(xd, subsize)) = n;
decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
}
break;
@@ -481,12 +491,16 @@
assert(0);
}
// update partition context
- if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
- return;
-
- xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = pc->above_seg_context + mi_col;
- update_partition_context(xd, subsize, bsize);
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+ if (bsize > BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+ set_partition_seg_context(pc, xd, mi_row, mi_col);
+ update_partition_context(xd, subsize, bsize);
+ }
}
static void setup_token_decoder(VP9D_COMP *pbi,
@@ -811,12 +825,12 @@
int mi_row, mi_col;
for (mi_row = pc->cur_tile_mi_row_start;
- mi_row < pc->cur_tile_mi_row_end; mi_row += 8) {
+ mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
for (mi_col = pc->cur_tile_mi_col_start;
- mi_col < pc->cur_tile_mi_col_end; mi_col += 8)
+ mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
}
}
@@ -1007,23 +1021,13 @@
// Select active reference frames and calculate scaling factors
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
const int ref = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
- const int mapped_ref = pc->ref_frame_map[ref];
- YV12_BUFFER_CONFIG *const fb = &pc->yv12_fb[mapped_ref];
- struct scale_factors *const sf = &pc->active_ref_scale[i];
-
- pc->active_ref_idx[i] = mapped_ref;
- if (mapped_ref >= NUM_YV12_BUFFERS)
- memset(sf, 0, sizeof(*sf));
- else
- vp9_setup_scale_factors_for_frame(sf,
- fb->y_crop_width, fb->y_crop_height,
- pc->width, pc->height);
+ pc->active_ref_idx[i] = pc->ref_frame_map[ref];
+ vp9_setup_scale_factors(pc, i);
}
// Read the sign bias for each reference frame buffer.
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+ for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
pc->ref_frame_sign_bias[i + 1] = vp9_read_bit(&header_bc);
- }
xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
@@ -1105,8 +1109,8 @@
if (pc->frame_type != KEY_FRAME) {
vp9_adapt_mode_probs(pc);
- vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
vp9_adapt_mode_context(pc);
+ vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
}
}
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 3aaae65..00b6d67 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -13,7 +13,9 @@
#define VP9_DECODER_VP9_DECODFRAME_H_
struct VP9Common;
+struct VP9Decompressor;
void vp9_init_dequantizer(struct VP9Common *pc);
+int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
#endif // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index ce2a86b..22d3cf8 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -62,7 +62,7 @@
#define INCREMENT_COUNT(token) \
do { \
- coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
+ coef_counts[type][ref][band] \
[pt][token]++; \
token_cache[scan[c]] = token; \
} while (0)
@@ -76,12 +76,6 @@
continue; \
}
-#define WRITE_COEF_ONE() \
-{ \
- qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(br, 1); \
- INCREMENT_COUNT(ONE_TOKEN); \
-}
-
#define ADJUST_COEF(prob, bits_count) \
do { \
if (vp9_read(r, prob)) \
@@ -96,6 +90,7 @@
ENTROPY_CONTEXT above_ec, left_ec;
FRAME_CONTEXT *const fc = &dx->common.fc;
int pt, c = 0, pad, default_eob;
+ int band;
vp9_coeff_probs *coef_probs;
vp9_prob *prob;
vp9_coeff_count *coef_counts;
@@ -103,6 +98,7 @@
TX_TYPE tx_type = DCT_DCT;
const int *scan, *nb;
uint8_t token_cache[1024];
+ const uint8_t * band_translate;
switch (txfm_size) {
default:
@@ -115,6 +111,7 @@
coef_probs = fc->coef_probs_4x4;
coef_counts = fc->coef_counts_4x4;
default_eob = 16;
+ band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
@@ -130,6 +127,7 @@
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
default_eob = 64;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
@@ -145,6 +143,7 @@
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
default_eob = 256;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_32X32:
@@ -154,6 +153,7 @@
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
default_eob = 1024;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
@@ -162,14 +162,13 @@
while (1) {
int val;
- int band;
const uint8_t *cat6 = cat6_prob;
if (c >= seg_eob)
break;
if (c)
pt = vp9_get_coef_context(scan, nb, pad, token_cache,
c, default_eob);
- band = get_coef_band(scan, txfm_size, c);
+ band = get_coef_band(band_translate, c);
prob = coef_probs[type][ref][band][pt];
fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
@@ -181,8 +180,9 @@
if (c)
pt = vp9_get_coef_context(scan, nb, pad, token_cache,
c, default_eob);
- band = get_coef_band(scan, txfm_size, c);
+ band = get_coef_band(band_translate, c);
prob = coef_probs[type][ref][band][pt];
+
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
++c;
@@ -249,8 +249,7 @@
}
if (c < seg_eob)
- coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
- [pt][DCT_EOB_TOKEN]++;
+ coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++;
for (pt = 0; pt < (1 << txfm_size); pt++) {
A[pt] = L[pt] = c > 0;
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 3480df2..bc943fa 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -105,14 +105,6 @@
add_residual(diff, dest, stride, 8, 8);
}
-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 16, 16);
-}
-
-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 32, 32);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -264,19 +256,14 @@
if (tx_type == DCT_DCT) {
vp9_idct_add_16x16(input, dest, stride, eob);
} else {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
if (eob > 0) {
- vp9_short_iht16x16(input, output, 16, tx_type);
+ vp9_short_iht16x16_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 512);
- vp9_add_residual_16x16(output, dest, stride);
}
}
}
void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob) {
@@ -292,21 +279,15 @@
vp9_add_constant_residual_16x16(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct10_16x16(input, output, 32);
-
+ vp9_short_idct10_16x16_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;
input[32] = input[33] = 0;
input[48] = 0;
-
- vp9_add_residual_16x16(output, dest, stride);
#endif
} else {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct16x16(input, output, 16 << 1);
+ vp9_short_idct16x16_add(input, dest, stride);
vpx_memset(input, 0, 512);
- vp9_add_residual_16x16(output, dest, stride);
}
}
}
@@ -321,20 +302,16 @@
input[0] = 0;
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct10_32x32(input, output, 64);
-
+ vp9_short_idct10_32x32_add_c(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[32] = input[33] = input[34] = 0;
input[64] = input[65] = 0;
input[96] = 0;
- vp9_add_residual_32x32(output, dest, stride);
#endif
} else {
- vp9_short_idct32x32(input, output, 64);
+ vp9_short_idct32x32_add(input, dest, stride);
vpx_memset(input, 0, 2048);
- vp9_add_residual_32x32(output, dest, stride);
}
}
}
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index a7d444e..8698570 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -41,8 +41,6 @@
int initial_height;
} VP9D_COMP;
-int vp9_decode_frame(VP9D_COMP *cpi, const uint8_t **p_data_end);
-
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(lval,expr) do {\
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 1296b70..796fc12 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -122,124 +122,6 @@
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
- int stride) {
- const int width = 16;
- int i = 4;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
- do {
- d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
- d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
- d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
- d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
- d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
- d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
- // Prediction data.
- p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
- p0 = _mm_unpacklo_epi8(p1, zero);
- p1 = _mm_unpackhi_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p3, zero);
- p3 = _mm_unpackhi_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p5, zero);
- p5 = _mm_unpackhi_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p7, zero);
- p7 = _mm_unpackhi_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p1 = _mm_packus_epi16(p2, p3);
- p2 = _mm_packus_epi16(p4, p5);
- p3 = _mm_packus_epi16(p6, p7);
-
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
- diff += 4 * width;
- dest += 4 * stride;
- } while (--i);
-}
-
-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
- int stride) {
- const int width = 32;
- int i = 16;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
- do {
- d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
- d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
- d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
- d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
- d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
- d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
- // Prediction data.
- p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
- p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
- p0 = _mm_unpacklo_epi8(p1, zero);
- p1 = _mm_unpackhi_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p3, zero);
- p3 = _mm_unpackhi_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p5, zero);
- p5 = _mm_unpackhi_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p7, zero);
- p7 = _mm_unpackhi_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p1 = _mm_packus_epi16(p2, p3);
- p2 = _mm_packus_epi16(p4, p5);
- p3 = _mm_packus_epi16(p6, p7);
-
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
- diff += 2 * width;
- dest += 2 * stride;
- } while (--i);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index da0bb21..3985451 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -629,12 +629,21 @@
active_section = 6;
#endif
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+ write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+#else
if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
else
write_ymode(bc, mode, pc->fc.ymode_prob);
+#endif
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
if (mode == I4X4_PRED) {
+#endif
int j = 0;
do {
write_bmode(bc, m->bmi[j].as_mode.first,
@@ -654,11 +663,16 @@
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_AB4X4
+ if (mi->sb_type >= BLOCK_SIZE_SB8X8)
+ write_sb_mv_ref(bc, mode, mv_ref_p);
+#else
if (mi->sb_type > BLOCK_SIZE_SB8X8) {
write_sb_mv_ref(bc, mode, mv_ref_p);
} else {
write_mv_ref(bc, mode, mv_ref_p);
}
+#endif
vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
}
@@ -744,11 +758,20 @@
}
}
+#if CONFIG_AB4X4
+ if (((rf == INTRA_FRAME && mi->sb_type >= BLOCK_SIZE_SB8X8) ||
+ (rf != INTRA_FRAME && mi->sb_type >= BLOCK_SIZE_SB8X8)) &&
+ pc->txfm_mode == TX_MODE_SELECT &&
+ !(skip_coeff || vp9_segfeature_active(xd, segment_id,
+ SEG_LVL_SKIP)))
+#else
if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
(rf != INTRA_FRAME && mode != SPLITMV)) &&
pc->txfm_mode == TX_MODE_SELECT &&
!(skip_coeff || vp9_segfeature_active(xd, segment_id,
- SEG_LVL_SKIP))) {
+ SEG_LVL_SKIP)))
+#endif
+ {
TX_SIZE sz = mi->txfm_size;
// FIXME(rbultje) code ternary symbol once all experiments are merged
vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
@@ -780,12 +803,21 @@
vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
}
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+ sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+#else
if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
else
kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
+#endif
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
if (ym == I4X4_PRED) {
+#endif
int i = 0;
do {
const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
@@ -803,8 +835,13 @@
write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+#if CONFIG_AB4X4
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT &&
+ !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
+#else
if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
!(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
+#endif
TX_SIZE sz = m->mbmi.txfm_size;
// FIXME(rbultje) code ternary symbol once all experiments are merged
vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
@@ -876,7 +913,19 @@
else
assert(0);
+#if CONFIG_AB4X4
+ if (bsize == BLOCK_SIZE_SB8X8 && m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+ partition = PARTITION_SPLIT;
+ if (bsize < BLOCK_SIZE_SB8X8)
+ if (xd->ab_index != 0)
+ return;
+#endif
+
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
if (bsize > BLOCK_SIZE_SB8X8) {
+#endif
int pl;
xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
xd->above_seg_context = cm->above_seg_context + mi_col;
@@ -905,6 +954,7 @@
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
int j = n >> 1, i = n & 0x01;
+ *(get_sb_index(xd, subsize)) = n;
write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
mi_row + j * bs, mi_col + i * bs, subsize);
}
@@ -914,12 +964,16 @@
}
// update partition context
- if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
- return;
-
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = cm->above_seg_context + mi_col;
- update_partition_context(xd, subsize, bsize);
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+ if (bsize > BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ update_partition_context(xd, subsize, bsize);
+ }
}
static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
@@ -1242,16 +1296,6 @@
FILE *vpxlogc = 0;
#endif
-static void put_delta_q(vp9_writer *bc, int delta_q) {
- if (delta_q != 0) {
- vp9_write_bit(bc, 1);
- vp9_write_literal(bc, abs(delta_q), 4);
- vp9_write_bit(bc, delta_q < 0);
- } else {
- vp9_write_bit(bc, 0);
- }
-}
-
static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
int mode_cost[MB_MODE_COUNT];
int bestcost = INT_MAX;
@@ -1298,9 +1342,21 @@
}
}
-static void encode_loopfilter(MACROBLOCKD *xd, vp9_writer *w) {
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_writer *w) {
int i;
+ // Encode the loop filter level and type
+ vp9_write_literal(w, pc->filter_level, 6);
+ vp9_write_literal(w, pc->sharpness_level, 3);
+#if CONFIG_LOOP_DERING
+ if (pc->dering_enabled) {
+ vp9_write_bit(w, 1);
+ vp9_write_literal(w, pc->dering_enabled - 1, 4);
+ } else {
+ vp9_write_bit(w, 0);
+ }
+#endif
+
// Write out loop filter deltas applied at the MB level based on mode or
// ref frame (if they are enabled).
vp9_write_bit(w, xd->mode_ref_lf_delta_enabled);
@@ -1354,6 +1410,24 @@
}
}
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+ if (delta_q != 0) {
+ vp9_write_bit(bc, 1);
+ vp9_write_literal(bc, abs(delta_q), 4);
+ vp9_write_bit(bc, delta_q < 0);
+ } else {
+ vp9_write_bit(bc, 0);
+ }
+}
+
+static void encode_quantization(VP9_COMMON *pc, vp9_writer *w) {
+ vp9_write_literal(w, pc->base_qindex, QINDEX_BITS);
+ put_delta_q(w, pc->y_dc_delta_q);
+ put_delta_q(w, pc->uv_dc_delta_q);
+ put_delta_q(w, pc->uv_ac_delta_q);
+}
+
+
static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
int i, j;
VP9_COMMON *const pc = &cpi->common;
@@ -1495,27 +1569,9 @@
// lossless mode: note this needs to be before loopfilter
vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
- // Encode the loop filter level and type
- vp9_write_literal(&header_bc, pc->filter_level, 6);
- vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
- if (pc->dering_enabled) {
- vp9_write_bit(&header_bc, 1);
- vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
- } else {
- vp9_write_bit(&header_bc, 0);
- }
-#endif
+ encode_loopfilter(pc, xd, &header_bc);
- encode_loopfilter(xd, &header_bc);
-
- // Frame Q baseline quantizer index
- vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
- // Transmit Dc, Second order and Uv quantizer delta information
- put_delta_q(&header_bc, pc->y_dc_delta_q);
- put_delta_q(&header_bc, pc->uv_dc_delta_q);
- put_delta_q(&header_bc, pc->uv_ac_delta_q);
+ encode_quantization(pc, &header_bc);
// When there is a key frame all reference buffers are updated using the new key frame
if (pc->frame_type != KEY_FRAME) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 6bc42c7..d3851b4 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -138,9 +138,14 @@
int optimize;
- // Structure to hold context for each of the 4 MBs within a SB:
- // when encoded as 4 independent MBs:
- PICK_MODE_CONTEXT sb8_context[4][4][4];
+ // TODO(jingning): Need to refactor the structure arrays that buffer the
+ // coding mode decisions of each partition type.
+#if CONFIG_AB4X4
+ PICK_MODE_CONTEXT ab4x4_context[4][4][4];
+ PICK_MODE_CONTEXT sb8x4_context[4][4][4];
+ PICK_MODE_CONTEXT sb4x8_context[4][4][4];
+#endif
+ PICK_MODE_CONTEXT sb8x8_context[4][4][4];
PICK_MODE_CONTEXT sb8x16_context[4][4][2];
PICK_MODE_CONTEXT sb16x8_context[4][4][2];
PICK_MODE_CONTEXT mb_context[4][4];
@@ -153,6 +158,13 @@
PICK_MODE_CONTEXT sb64_context;
int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+#if CONFIG_AB4X4
+ BLOCK_SIZE_TYPE b_partitioning[4][4][4];
+#endif
+ BLOCK_SIZE_TYPE mb_partitioning[4][4];
+ BLOCK_SIZE_TYPE sb_partitioning[4];
+ BLOCK_SIZE_TYPE sb64_partitioning;
+
void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 418f60e..954eefa 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -361,8 +361,8 @@
assert(mb_mode_index < MAX_MODES);
assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
#endif
- assert(mi->mbmi.sb_type == bsize);
+ assert(mi->mbmi.sb_type == bsize);
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
for (y = 0; y < bh; y++) {
@@ -539,15 +539,6 @@
x->e_mbd.plane[2].subsampling_y);
}
-static INLINE void set_partition_seg_context(VP9_COMP *cpi,
- int mi_row, int mi_col) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
- xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-}
-
static void set_offsets(VP9_COMP *cpi,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
MACROBLOCK *const x = &cpi->mb;
@@ -571,7 +562,7 @@
}
// partition contexts
- set_partition_seg_context(cpi, mi_row, mi_col);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
// Activity map pointer
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -649,6 +640,12 @@
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8)
+ if (xd->ab_index != 0)
+ return;
+#endif
+
set_offsets(cpi, mi_row, mi_col, bsize);
xd->mode_info_context->mbmi.sb_type = bsize;
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
@@ -727,10 +724,20 @@
} else if (bsize >= BLOCK_SIZE_MB16X16) {
xd->mb_index = idx;
} else {
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ xd->b_index = idx;
+ else
+ xd->ab_index = idx;
+#else
xd->b_index = idx;
+#endif
}
}
+// TODO(jingning): the variables used here are a little complicated. Need
+// further refactoring to organize the temporary buffers when recursive
+// partitioning down to 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -755,13 +762,71 @@
case BLOCK_SIZE_SB8X16:
return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
case BLOCK_SIZE_SB8X8:
- return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+#if CONFIG_AB4X4
+ case BLOCK_SIZE_SB8X4:
+ return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_SIZE_SB4X8:
+ return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_SIZE_AB4X4:
+ return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+#endif
default:
assert(0);
return NULL;
}
}
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+ BLOCK_SIZE_TYPE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ switch (bsize) {
+ case BLOCK_SIZE_SB64X64:
+ return &x->sb64_partitioning;
+ case BLOCK_SIZE_SB32X32:
+ return &x->sb_partitioning[xd->sb_index];
+ case BLOCK_SIZE_MB16X16:
+ return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+#if CONFIG_AB4X4
+ case BLOCK_SIZE_SB8X8:
+ return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+#endif
+ default:
+ assert(0);
+ return NULL;
+ }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8],
+ PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int p;
+ int bwl = b_width_log2(bsize), bw = 1 << bwl;
+ int bhl = b_height_log2(bsize), bh = 1 << bhl;
+ int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+ int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+ for (p = 0; p < MAX_MB_PLANE; p++) {
+ vpx_memcpy(cm->above_context[p] +
+ ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ a + bw * p,
+ sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+ vpx_memcpy(cm->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ l + bh * p,
+ sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ }
+ vpx_memcpy(cm->above_seg_context + mi_col, sa,
+ sizeof(PARTITION_CONTEXT) * mw);
+ vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(PARTITION_CONTEXT) * mh);
+}
+
static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
int mi_row, int mi_col, int output_enabled,
BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -788,28 +853,45 @@
static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE level,
- BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
- BLOCK_SIZE_TYPE c3[4][4]
- ) {
+ BLOCK_SIZE_TYPE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
- const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+ BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+ const int bsl = mi_width_log2(bsize), bs = (1 << bsl) / 2;
+ int bwl, bhl;
int UNINITIALIZED_IS_SAFE(pl);
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (level > BLOCK_SIZE_SB8X8) {
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, level);
+#if CONFIG_AB4X4
+ c1 = BLOCK_SIZE_AB4X4;
+ if (bsize >= BLOCK_SIZE_SB8X8)
+#else
+ if (bsize > BLOCK_SIZE_SB8X8)
+#endif
+ {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ c1 = *(get_sb_partitioning(x, bsize));
}
+ bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+
if (bsl == bwl && bsl == bhl) {
- if (output_enabled && level > BLOCK_SIZE_SB8X8)
+#if CONFIG_AB4X4
+ if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) {
+ if (bsize > BLOCK_SIZE_SB8X8 ||
+ (bsize == BLOCK_SIZE_SB8X8 && c1 == bsize))
+ cpi->partition_count[pl][PARTITION_NONE]++;
+ else
+ cpi->partition_count[pl][PARTITION_SPLIT]++;
+ }
+#else
+ if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
+#endif
encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
} else if (bsl == bhl && bsl > bwl) {
if (output_enabled)
@@ -826,14 +908,7 @@
int i;
assert(bwl < bsl && bhl < bsl);
- if (level == BLOCK_SIZE_SB64X64) {
- subsize = BLOCK_SIZE_SB32X32;
- } else if (level == BLOCK_SIZE_SB32X32) {
- subsize = BLOCK_SIZE_MB16X16;
- } else {
- assert(level == BLOCK_SIZE_MB16X16);
- subsize = BLOCK_SIZE_SB8X8;
- }
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
if (output_enabled)
cpi->partition_count[pl][PARTITION_SPLIT]++;
@@ -843,26 +918,226 @@
set_block_index(xd, i, subsize);
encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
- output_enabled, subsize,
- c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
+ output_enabled, subsize);
}
}
- if (level > BLOCK_SIZE_SB8X8 &&
- (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
- set_partition_seg_context(cpi, mi_row, mi_col);
- update_partition_context(xd, c1, level);
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+#else
+ if (bsize > BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
+#endif
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ update_partition_context(xd, c1, bsize);
}
}
-static void encode_sb_row(VP9_COMP *cpi,
- int mi_row,
- TOKENEXTRA **tp,
- int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+ int mi_row, int mi_col,
+ BLOCK_SIZE_TYPE bsize,
+ int *rate, int *dist) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- int mi_col, pl;
+ int bsl = b_width_log2(bsize), bs = 1 << bsl;
+ int ms = bs / 2;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+ TOKENEXTRA *tp_orig = *tp;
+ int i, p, pl;
+ BLOCK_SIZE_TYPE subsize;
+ int srate = INT_MAX, sdist = INT_MAX;
+
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8)
+ if (xd->ab_index != 0) {
+ *rate = 0;
+ *dist = 0;
+ return;
+ }
+#endif
+
+ assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+
+ // buffer the above/left context information of the block in search.
+ for (p = 0; p < MAX_MB_PLANE; ++p) {
+ vpx_memcpy(a + bs * p, cm->above_context[p] +
+ (mi_col * 2 >> xd->plane[p].subsampling_x),
+ sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
+ vpx_memcpy(l + bs * p, cm->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
+ }
+ vpx_memcpy(sa, cm->above_seg_context + mi_col,
+ sizeof(PARTITION_CONTEXT) * ms);
+ vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+ sizeof(PARTITION_CONTEXT) * ms);
+
+ // PARTITION_SPLIT
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
+ if (bsize >= BLOCK_SIZE_MB16X16) {
+#endif
+ int r4 = 0, d4 = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ *(get_sb_partitioning(x, bsize)) = subsize;
+
+ for (i = 0; i < 4; ++i) {
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
+ int r, d;
+
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ *(get_sb_index(xd, subsize)) = i;
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &r, &d);
+
+ r4 += r;
+ d4 += d;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+#if CONFIG_AB4X4
+ if (r4 < INT_MAX)
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
+#else
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
+#endif
+ assert(r4 >= 0);
+ assert(d4 >= 0);
+ srate = r4;
+ sdist = d4;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // TODO(jingning): need to enable 4x8 and 8x4 partition coding
+ // PARTITION_HORZ
+ if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+ (bsize >= BLOCK_SIZE_MB16X16)) {
+ int r2, d2;
+ int mb_skip = 0;
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+
+ if (mi_row + ms <= cm->mi_rows) {
+ int r, d;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ } else {
+ if (mi_row + (ms >> 1) != cm->mi_rows)
+ mb_skip = 1;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r2 += x->partition_cost[pl][PARTITION_HORZ];
+
+ if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // PARTITION_VERT
+ if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+ (bsize >= BLOCK_SIZE_MB16X16)) {
+ int r2, d2;
+ int mb_skip = 0;
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+ if (mi_col + ms <= cm->mi_cols) {
+ int r, d;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ } else {
+ if (mi_col + (ms >> 1) != cm->mi_cols)
+ mb_skip = 1;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r2 += x->partition_cost[pl][PARTITION_VERT];
+
+ if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // PARTITION_NONE
+ if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) {
+ int r, d;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+ get_block_context(x, bsize));
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
+ if (bsize >= BLOCK_SIZE_MB16X16) {
+#endif
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r += x->partition_cost[pl][PARTITION_NONE];
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, r, d) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r;
+ sdist = d;
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8)
+#else
+ if (bsize >= BLOCK_SIZE_MB16X16)
+#endif
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ }
+ }
+
+ *rate = srate;
+ *dist = sdist;
+
+ if (srate < INT_MAX && sdist < INT_MAX)
+ encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+
+ if (bsize == BLOCK_SIZE_SB64X64) {
+ assert(tp_orig < *tp);
+ assert(srate < INT_MAX);
+ assert(sdist < INT_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+ TOKENEXTRA **tp, int *totalrate) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mi_col;
// Initialize the left context for the new SB row
vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
@@ -871,526 +1146,9 @@
// Code each SB in the row
for (mi_col = cm->cur_tile_mi_col_start;
mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
- int i, p;
- BLOCK_SIZE_TYPE mb_partitioning[4][4];
- BLOCK_SIZE_TYPE sb_partitioning[4];
- BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
- int sb64_rate = 0, sb64_dist = 0;
- int sb64_skip = 0;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE];
- TOKENEXTRA *tp_orig = *tp;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(a + 16 * p, cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(l + 16 * p, cm->left_context[p],
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a));
- vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
-
- // FIXME(rbultje): this function should probably be rewritten to be
- // recursive at some point in the future.
- for (i = 0; i < 4; i++) {
- const int x_idx = (i & 1) << 2;
- const int y_idx = (i & 2) << 1;
- int sb32_rate = 0, sb32_dist = 0;
- int splitmodes_used = 0;
- int sb32_skip = 0;
- int j;
- ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE];
-
- sb_partitioning[i] = BLOCK_SIZE_MB16X16;
- if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
- continue;
-
- xd->sb_index = i;
-
- /* Function should not modify L & A contexts; save and restore on exit */
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(l2 + 8 * p,
- cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(a2 + 8 * p,
- cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32));
- vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32));
-
- /* Encode MBs in raster order within the SB */
- for (j = 0; j < 4; j++) {
- const int x_idx_m = x_idx + ((j & 1) << 1);
- const int y_idx_m = y_idx + ((j >> 1) << 1);
- int r, d;
- int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
- ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE];
-
- mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
-
- if (mi_row + y_idx_m >= cm->mi_rows ||
- mi_col + x_idx_m >= cm->mi_cols) {
- // MB lies outside frame, move on
- continue;
- }
-
- // Index of the MB in the SB 0..3
- xd->mb_index = j;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(l3 + 4 * p,
- cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(a3 + 4 * p,
- cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m,
- sizeof(sa16));
- vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16));
-
- for (k = 0; k < 4; k++) {
- xd->b_index = k;
-
- // try 8x8 coding
- pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
- mi_col + x_idx_m + (k & 1),
- tp, &r, &d, BLOCK_SIZE_SB8X8,
- &x->sb8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- mb16_rate += r;
- mb16_dist += d;
- update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB8X8, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m + (k >> 1),
- mi_col + x_idx_m + (k & 1),
- BLOCK_SIZE_SB8X8);
- }
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m,
- sa16, sizeof(sa16));
- vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16));
-
- // try 8x16 coding
- r2 = 0;
- d2 = 0;
- xd->b_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB8X16,
- &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB8X16, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m, mi_col + x_idx_m,
- BLOCK_SIZE_SB8X16);
- xd->b_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
- tp, &r, &d, BLOCK_SIZE_SB8X16,
- &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r2;
- mb16_dist = d2;
- mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
- }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
-
- // try 16x8 coding
- r2 = 0;
- d2 = 0;
- xd->b_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB16X8,
- &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB16X8, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m, mi_col + x_idx_m,
- BLOCK_SIZE_SB16X8);
- xd->b_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB16X8,
- &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r2;
- mb16_dist = d2;
- mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
- }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
-
- // try as 16x16
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_MB16X16,
- &x->mb_context[xd->sb_index][xd->mb_index]);
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r += x->partition_cost[pl][PARTITION_NONE];
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r;
- mb16_dist = d;
- mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
- }
- sb32_rate += mb16_rate;
- sb32_dist += mb16_dist;
-
- // Dummy encode, do not do the tokenization
- encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
- BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
- }
-
- /* Restore L & A coding context to those in place on entry */
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- // restore partition information context
- vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32));
- vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32));
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
- if (cpi->sf.splitmode_breakout) {
- sb32_skip = splitmodes_used;
- sb64_skip += splitmodes_used;
- }
-
- // check 32x16
- if (mi_col + x_idx + 4 <= cm->mi_cols) {
- int r, d;
-
- xd->mb_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB32X16,
- &x->sb32x16_context[xd->sb_index][xd->mb_index]);
- if (mi_row + y_idx + 2 < cm->mi_rows) {
- int r2, d2;
-
- update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
- BLOCK_SIZE_SB32X16, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx, mi_col + x_idx,
- BLOCK_SIZE_SB32X16);
- xd->mb_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx + 2,
- mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
- &x->sb32x16_context[xd->sb_index][xd->mb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_HORZ];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB32X16;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- }
-
- // check 16x32
- if (mi_row + y_idx + 4 <= cm->mi_rows) {
- int r, d;
-
- xd->mb_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB16X32,
- &x->sb16x32_context[xd->sb_index][xd->mb_index]);
- if (mi_col + x_idx + 2 < cm->mi_cols) {
- int r2, d2;
-
- update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
- BLOCK_SIZE_SB16X32, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx, mi_col + x_idx,
- BLOCK_SIZE_SB16X32);
- xd->mb_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx,
- mi_col + x_idx + 2,
- tp, &r2, &d2, BLOCK_SIZE_SB16X32,
- &x->sb16x32_context[xd->sb_index][xd->mb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_VERT];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB16X32;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- }
-
- if (!sb32_skip &&
- mi_col + x_idx + 4 <= cm->mi_cols &&
- mi_row + y_idx + 4 <= cm->mi_rows) {
- int r, d;
-
- /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB32X32,
- &x->sb32_context[xd->sb_index]);
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_NONE];
-
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB32X32;
- }
- }
-
- // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
- if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
- ++sb64_skip;
- }
-
- sb64_rate += sb32_rate;
- sb64_dist += sb32_dist;
-
- /* Encode SB using best computed mode(s) */
- // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
- // for each level that we go up, we can just keep tokens and recon
- // pixels of the lower level; also, inverting SB/MB order (big->small
- // instead of small->big) means we can use as threshold for small, which
- // may enable breakouts if RD is not good enough (i.e. faster)
- encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
- BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
- NULL);
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a));
- memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
-
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
- // check 64x32
- if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
- int r, d;
-
- xd->sb_index = 0;
- pick_sb_modes(cpi, mi_row, mi_col,
- tp, &r, &d, BLOCK_SIZE_SB64X32,
- &x->sb64x32_context[xd->sb_index]);
- if (mi_row + 4 != cm->mi_rows) {
- int r2, d2;
-
- update_state(cpi, &x->sb64x32_context[xd->sb_index],
- BLOCK_SIZE_SB64X32, 0);
- encode_superblock(cpi, tp,
- 0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
- xd->sb_index = 1;
- pick_sb_modes(cpi, mi_row + 4, mi_col,
- tp, &r2, &d2, BLOCK_SIZE_SB64X32,
- &x->sb64x32_context[xd->sb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_HORZ];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB64X32;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- }
-
- // check 32x64
- if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
- int r, d;
-
- xd->sb_index = 0;
- pick_sb_modes(cpi, mi_row, mi_col,
- tp, &r, &d, BLOCK_SIZE_SB32X64,
- &x->sb32x64_context[xd->sb_index]);
- if (mi_col + 4 != cm->mi_cols) {
- int r2, d2;
-
- update_state(cpi, &x->sb32x64_context[xd->sb_index],
- BLOCK_SIZE_SB32X64, 0);
- encode_superblock(cpi, tp,
- 0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
- xd->sb_index = 1;
- pick_sb_modes(cpi, mi_row, mi_col + 4,
- tp, &r2, &d2, BLOCK_SIZE_SB32X64,
- &x->sb32x64_context[xd->sb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_VERT];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB32X64;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- }
-
- if (!sb64_skip &&
- mi_col + 8 <= cm->mi_cols &&
- mi_row + 8 <= cm->mi_rows) {
- int r, d;
-
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
- BLOCK_SIZE_SB64X64, &x->sb64_context);
-
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_NONE];
-
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB64X64;
- }
- }
-
- assert(tp_orig == *tp);
- encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
- sb64_partitioning, sb_partitioning, mb_partitioning);
- assert(tp_orig < *tp);
+ int dummy_rate, dummy_dist;
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
}
}
@@ -1559,9 +1317,8 @@
vp9_get_tile_col_offsets(cm, tile_col);
for (mi_row = cm->cur_tile_mi_row_start;
mi_row < cm->cur_tile_mi_row_end;
- mi_row += 8) {
+ mi_row += 8)
encode_sb_row(cpi, mi_row, &tp, &totalrate);
- }
cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <=
get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -1901,7 +1658,11 @@
}
#endif
+#if CONFIG_AB4X4
+ if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+#else
if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) {
+#endif
++cpi->sb_ymode_count[m];
} else {
++cpi->ymode_count[m];
@@ -1986,13 +1747,17 @@
vp9_update_zbin_extra(cpi, x);
}
+#if CONFIG_AB4X4
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ bsize < BLOCK_SIZE_SB8X8) {
+#else
if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
assert(bsize == BLOCK_SIZE_SB8X8 &&
xd->mode_info_context->mbmi.txfm_size == TX_4X4);
-
- vp9_encode_intra4x4mby(x, bsize);
- vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
- vp9_encode_sbuv(cm, x, bsize);
+#endif
+ vp9_encode_intra4x4mby(x, BLOCK_SIZE_SB8X8);
+ vp9_build_intra_predictors_sbuv_s(&x->e_mbd, BLOCK_SIZE_SB8X8);
+ vp9_encode_sbuv(cm, x, BLOCK_SIZE_SB8X8);
if (output_enabled)
sum_intra_stats(cpi, x);
@@ -2028,15 +1793,22 @@
? &cpi->common.yv12_fb[second_ref_fb_idx] : NULL,
mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
}
+#if CONFIG_AB4X4
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ bsize < BLOCK_SIZE_SB8X8) {
+#else
if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
assert(bsize == BLOCK_SIZE_SB8X8);
- vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+#endif
+ vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, BLOCK_SIZE_SB8X8);
} else if (!x->skip) {
- vp9_encode_sb(cm, x, bsize);
- vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+ vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
} else {
// FIXME(rbultje): not tile-aware (mi - 1)
int mb_skip_context =
@@ -2045,7 +1817,8 @@
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
if (output_enabled)
cpi->skip_true_count[mb_skip_context]++;
- vp9_reset_sb_tokens_context(xd, bsize);
+ vp9_reset_sb_tokens_context(xd,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
}
// copy skip flag on all mb_mode_info contexts in this SB
@@ -2075,8 +1848,12 @@
sz = TX_16X16;
if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
sz = TX_8X8;
+#if CONFIG_AB4X4
+ if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8)
+#else
if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
xd->mode_info_context->mbmi.mode == I4X4_PRED))
+#endif
sz = TX_4X4;
for (y = 0; y < bh; y++) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4665fcc..221de74 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -139,6 +139,7 @@
const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
block, 2 * tx_size);
const int16_t *dequant_ptr = xd->plane[plane].dequant;
+ const uint8_t * band_translate;
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
@@ -149,23 +150,27 @@
const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
default_eob = 16;
scan = get_scan_4x4(tx_type);
+ band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
scan = get_scan_8x8(tx_type);
default_eob = 64;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
scan = get_scan_16x16(tx_type);
default_eob = 256;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_32X32:
scan = vp9_default_zig_zag1d_32x32;
default_eob = 1024;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
assert(eob <= default_eob);
@@ -204,7 +209,7 @@
t0 = (vp9_dct_value_tokens_ptr + x)->token;
/* Consider both possible successor states. */
if (next < default_eob) {
- band = get_coef_band(scan, tx_size, i + 1);
+ band = get_coef_band(band_translate, i + 1);
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
pad, default_eob);
rate0 +=
@@ -254,7 +259,7 @@
t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
}
if (next < default_eob) {
- band = get_coef_band(scan, tx_size, i + 1);
+ band = get_coef_band(band_translate, i + 1);
if (t0 != DCT_EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
pad, default_eob);
@@ -291,7 +296,7 @@
* add a new trellis node, but we do need to update the costs.
*/
else {
- band = get_coef_band(scan, tx_size, i + 1);
+ band = get_coef_band(band_translate, i + 1);
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
/* Update the cost of each path if we're past the EOB token. */
@@ -310,7 +315,7 @@
}
/* Now pick the best path through the whole trellis. */
- band = get_coef_band(scan, tx_size, i + 1);
+ band = get_coef_band(band_translate, i + 1);
pt = combine_entropy_contexts(*a, *l);
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
@@ -420,6 +425,7 @@
VP9_COMMON *cm;
MACROBLOCK *x;
struct optimize_ctx *ctx;
+ int *wip_txfrm_size; // for "work in progress" only... will remove once done
};
static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -488,6 +494,7 @@
int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
+ int *wip_txfrm_size = args->wip_txfrm_size;
MACROBLOCKD* const xd = &x->e_mbd;
const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -495,6 +502,10 @@
int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
raster_block,
xd->plane[plane].diff);
+ uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+ raster_block,
+ xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride);
TX_TYPE tx_type = DCT_DCT;
xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@@ -504,18 +515,21 @@
switch (ss_txfrm_size / 2) {
case TX_32X32:
- vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
+ *wip_txfrm_size = 32;
break;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
- vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
} else {
- vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw, tx_type);
+ vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride,
+ tx_type);
}
+ *wip_txfrm_size = 16;
break;
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -526,6 +540,7 @@
vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
}
+ *wip_txfrm_size = 8;
break;
case TX_4X4:
tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -539,6 +554,7 @@
vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
}
+ *wip_txfrm_size = 4;
break;
}
}
@@ -546,7 +562,7 @@
void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
+ struct encode_b_args arg = {cm, x, NULL, NULL};
foreach_transformed_block_in_plane(xd, bsize, 0,
xform_quant, &arg);
@@ -555,7 +571,7 @@
void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
+ struct encode_b_args arg = {cm, x, NULL, NULL};
foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
}
@@ -564,7 +580,8 @@
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sby(x, bsize);
if (x->optimize)
@@ -572,15 +589,16 @@
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
-
- vp9_recon_sby(xd, bsize);
+ if (wip_txfrm_size < 32)
+ vp9_recon_sby(xd, bsize);
}
void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sbuv(x, bsize);
if (x->optimize)
@@ -588,20 +606,35 @@
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- vp9_recon_sbuv(xd, bsize);
+ if (wip_txfrm_size < 16)
+ vp9_recon_sbuv(xd, bsize);
}
void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sb(x, bsize);
if (x->optimize)
vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
foreach_transformed_block(xd, bsize, encode_block, &arg);
vp9_recon_sb(xd, bsize);
+#else
+ // wip version... will use foreach_transformed_block when done
+ foreach_transformed_block_in_plane(xd, bsize, 0,
+ encode_block, &arg);
+ if (wip_txfrm_size < 16)
+ vp9_recon_sby(xd, bsize);
+ wip_txfrm_size = 0;
+
+ foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+ if (wip_txfrm_size < 16)
+ vp9_recon_sbuv(xd, bsize);
+#endif
}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index ff0725f..0561efe 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -47,7 +47,7 @@
#define KF_MB_INTRA_MIN 150
#define GF_MB_INTRA_MIN 100
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
@@ -78,8 +78,8 @@
// Resets the first pass file to the given position using a relative seek from the current position
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
- cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+ cpi->twopass.stats_in = position;
}
static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -252,17 +252,11 @@
// Calculate a modified Error used in distributing bits between easier and harder frames
static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
- cpi->twopass.total_stats.count);
- double this_err = this_frame->ssim_weighted_pred_err;
- double modified_err;
-
- if (this_err > av_err)
- modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
- else
- modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
- return modified_err;
+ const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+ const double av_err = stats->ssim_weighted_pred_err / stats->count;
+ const double this_err = this_frame->ssim_weighted_pred_err;
+ return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+ this_err > av_err ? POW1 : POW2);
}
static const double weight_table[256] = {
@@ -328,20 +322,14 @@
static int frame_max_bits(VP9_COMP *cpi) {
// Max allocation for a single frame based on the max section guidelines
// passed in and how many bits are left.
- int max_bits;
-
// For VBR base this on the bits and frames left plus the
// two_pass_vbrmax_section rate passed in by the user.
- max_bits = (int) (((double) cpi->twopass.bits_left
- / (cpi->twopass.total_stats.count - (double) cpi->common
- .current_video_frame))
- * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ const double max_bits = (1.0 * cpi->twopass.bits_left /
+ (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+ (cpi->oxcf.two_pass_vbrmax_section / 100.0);
// Trap case where we are out of bits.
- if (max_bits < 0)
- max_bits = 0;
-
- return max_bits;
+ return MAX((int)max_bits, 0);
}
void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -534,6 +522,8 @@
xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
+ xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
+
// do intra 16x16 prediction
this_error = vp9_encode_intra(cpi, x, use_dc_pred);
@@ -632,7 +622,7 @@
vp9_build_inter_predictors_sby(xd, mb_row << 1,
mb_col << 1,
BLOCK_SIZE_MB16X16);
- vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
+ vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
@@ -854,26 +844,18 @@
double err_divisor,
double pt_low,
double pt_high,
- int Q) {
- double power_term;
- double error_term = err_per_mb / err_divisor;
- double correction_factor;
+ int q) {
+ const double error_term = err_per_mb / err_divisor;
// Adjustment based on actual quantizer to power term.
- power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
- power_term = (power_term > pt_high) ? pt_high : power_term;
+ const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+ pt_high);
// Calculate correction factor
if (power_term < 1.0)
assert(error_term >= 0.0);
- correction_factor = pow(error_term, power_term);
- // Clip range
- correction_factor =
- (correction_factor < 0.05)
- ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
- return correction_factor;
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
}
// Given a current maxQ value sets a range for future values.
@@ -882,10 +864,8 @@
// (now uses the actual quantizer) but has not been tuned.
static void adjust_maxq_qrange(VP9_COMP *cpi) {
int i;
- double q;
-
// Set the max corresponding to cpi->avg_q * 2.0
- q = cpi->avg_q * 2.0;
+ double q = cpi->avg_q * 2.0;
cpi->twopass.maxq_max_limit = cpi->worst_quality;
for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
cpi->twopass.maxq_max_limit = i;
@@ -906,12 +886,11 @@
static int estimate_max_q(VP9_COMP *cpi,
FIRSTPASS_STATS *fpstats,
int section_target_bandwitdh) {
- int Q;
+ int q;
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
- double section_err = (fpstats->coded_error / fpstats->count);
- double sr_err_diff;
+ double section_err = fpstats->coded_error / fpstats->count;
double sr_correction;
double err_per_mb = section_err / num_mbs;
double err_correction_factor;
@@ -920,92 +899,74 @@
if (section_target_bandwitdh <= 0)
return cpi->twopass.maxq_max_limit; // Highest value allowed
- target_norm_bits_per_mb =
- (section_target_bandwitdh < (1 << 20))
- ? (512 * section_target_bandwitdh) / num_mbs
- : 512 * (section_target_bandwitdh / num_mbs);
+ target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+ ? (512 * section_target_bandwitdh) / num_mbs
+ : 512 * (section_target_bandwitdh / num_mbs);
// Look at the drop in prediction quality between the last frame
// and the GF buffer (which contained an older frame).
if (fpstats->sr_coded_error > fpstats->coded_error) {
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
- sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
+ double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
} else {
sr_correction = 0.75;
}
// Calculate a corrective factor based on a rolling ratio of bits spent
// vs target bits
- if ((cpi->rolling_target_bits > 0) &&
- (cpi->active_worst_quality < cpi->worst_quality)) {
- double rolling_ratio;
-
- rolling_ratio = (double)cpi->rolling_actual_bits /
- (double)cpi->rolling_target_bits;
+ if (cpi->rolling_target_bits > 0 &&
+ cpi->active_worst_quality < cpi->worst_quality) {
+ double rolling_ratio = (double)cpi->rolling_actual_bits /
+ (double)cpi->rolling_target_bits;
if (rolling_ratio < 0.95)
cpi->twopass.est_max_qcorrection_factor -= 0.005;
else if (rolling_ratio > 1.05)
cpi->twopass.est_max_qcorrection_factor += 0.005;
- cpi->twopass.est_max_qcorrection_factor =
- (cpi->twopass.est_max_qcorrection_factor < 0.1)
- ? 0.1
- : (cpi->twopass.est_max_qcorrection_factor > 10.0)
- ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+ cpi->twopass.est_max_qcorrection_factor = fclamp(
+ cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
}
// Corrections for higher compression speed settings
// (reduced compression expected)
- if (cpi->compressor_speed == 1) {
- if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
- else
- speed_correction = 1.25;
- }
+ if (cpi->compressor_speed == 1)
+ speed_correction = cpi->oxcf.cpu_used <= 5 ?
+ 1.04 + (cpi->oxcf.cpu_used * 0.04) :
+ 1.25;
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
- for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+ for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
int bits_per_mb_at_this_q;
- err_correction_factor =
- calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
- sr_correction * speed_correction *
- cpi->twopass.est_max_qcorrection_factor;
+ err_correction_factor = calc_correction_factor(err_per_mb,
+ ERR_DIVISOR, 0.4, 0.90, q) *
+ sr_correction * speed_correction *
+ cpi->twopass.est_max_qcorrection_factor;
-
- bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+ bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+ err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
}
// Restriction on active max q for constrained quality mode.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (Q < cpi->cq_target_quality)) {
- Q = cpi->cq_target_quality;
- }
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ q < cpi->cq_target_quality)
+ q = cpi->cq_target_quality;
// Adjust maxq_min_limit and maxq_max_limit limits based on
// average q observed in clip for non kf/gf/arf frames
// Give average a chance to settle though.
// PGW TODO.. This code is broken for the extended Q range
- if ((cpi->ni_frames >
- ((int)cpi->twopass.total_stats.count >> 8)) &&
- (cpi->ni_frames > 25)) {
+ if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+ cpi->ni_frames > 25)
adjust_maxq_qrange(cpi);
- }
- return Q;
+ return q;
}
// For cq mode estimate a cq level that matches the observed
@@ -1013,7 +974,7 @@
static int estimate_cq(VP9_COMP *cpi,
FIRSTPASS_STATS *fpstats,
int section_target_bandwitdh) {
- int Q;
+ int q;
int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
@@ -1064,29 +1025,29 @@
clip_iifactor = 0.80;
// Try and pick a Q that can encode the content at the given rate.
- for (Q = 0; Q < MAXQ; Q++) {
+ for (q = 0; q < MAXQ; q++) {
int bits_per_mb_at_this_q;
// Error per MB based correction factor
err_correction_factor =
- calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+ calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
sr_correction * speed_correction * clip_iifactor;
bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+ vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
}
// Clip value to range "best allowed to (worst allowed - 1)"
- Q = select_cq_level(Q);
- if (Q >= cpi->worst_quality)
- Q = cpi->worst_quality - 1;
- if (Q < cpi->best_quality)
- Q = cpi->best_quality;
+ q = select_cq_level(q);
+ if (q >= cpi->worst_quality)
+ q = cpi->worst_quality - 1;
+ if (q < cpi->best_quality)
+ q = cpi->best_quality;
- return Q;
+ return q;
}
@@ -1117,9 +1078,8 @@
// encoded in the second pass is a guess. However the sum duration is not.
// Its calculated based on the actual durations of all frames from the first
// pass.
- vp9_new_frame_rate(cpi,
- 10000000.0 * cpi->twopass.total_stats.count /
- cpi->twopass.total_stats.duration);
+ vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+ cpi->twopass.total_stats.duration);
cpi->output_frame_rate = cpi->oxcf.frame_rate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
@@ -1191,9 +1151,8 @@
// Look at the observed drop in prediction quality between the last frame
// and the GF buffer (which contains an older frame).
- mb_sr_err_diff =
- (next_frame->sr_coded_error - next_frame->coded_error) /
- (cpi->common.MBs);
+ mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+ cpi->common.MBs;
if (mb_sr_err_diff <= 512.0) {
second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1225,9 +1184,9 @@
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
- if ((frame_interval > MIN_GF_INTERVAL) &&
- (loop_decay_rate >= 0.999) &&
- (last_decay_rate < 0.9)) {
+ if (frame_interval > MIN_GF_INTERVAL &&
+ loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
int j;
FIRSTPASS_STATS *position = cpi->twopass.stats_in;
FIRSTPASS_STATS tmp_next_frame;
@@ -1271,10 +1230,9 @@
// are reasonably well predicted by an earlier (pre flash) frame.
// The recovery after a flash is indicated by a high pcnt_second_ref
// comapred to pcnt_inter.
- if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
- (next_frame.pcnt_second_ref >= 0.5)) {
+ if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+ next_frame.pcnt_second_ref >= 0.5)
flash_detected = 1;
- }
}
return flash_detected;
@@ -1356,13 +1314,9 @@
return frame_boost;
}
-static int calc_arf_boost(
- VP9_COMP *cpi,
- int offset,
- int f_frames,
- int b_frames,
- int *f_boost,
- int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+ int f_frames, int b_frames,
+ int *f_boost, int *b_boost) {
FIRSTPASS_STATS this_frame;
int i;
@@ -1392,8 +1346,7 @@
// Cumulative effect of prediction quality decay
if (!flash_detected) {
- decay_accumulator =
- decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
? MIN_DECAY_FACTOR : decay_accumulator;
}
@@ -1429,10 +1382,9 @@
// Cumulative effect of prediction quality decay
if (!flash_detected) {
- decay_accumulator =
- decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
- ? MIN_DECAY_FACTOR : decay_accumulator;
+ ? MIN_DECAY_FACTOR : decay_accumulator;
}
boost_score += (decay_accumulator *
@@ -1871,26 +1823,20 @@
for (i = 0;
i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
++i) {
- int boost;
int allocation_chunks;
- int Q =
- (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+ : cpi->oxcf.fixed_q;
int gf_bits;
- boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+ int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
// Set max and minimum boost and hence minimum allocation
- if (boost > ((cpi->baseline_gf_interval + 1) * 200))
- boost = ((cpi->baseline_gf_interval + 1) * 200);
- else if (boost < 125)
- boost = 125;
+ boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
if (cpi->source_alt_ref_pending && i == 0)
- allocation_chunks =
- ((cpi->baseline_gf_interval + 1) * 100) + boost;
+ allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
else
- allocation_chunks =
- (cpi->baseline_gf_interval * 100) + (boost - 100);
+ allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
// Prevent overflow
if (boost > 1023) {
@@ -1901,41 +1847,34 @@
// Calculate the number of bits to be spent on the gf or arf based on
// the boost number
- gf_bits = (int)((double)boost *
- (cpi->twopass.gf_group_bits /
- (double)allocation_chunks));
+ gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+ (double)allocation_chunks));
// If the frame that is to be boosted is simpler than the average for
// the gf/arf group then use an alternative calculation
// based on the error score of the frame itself
if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
- double alt_gf_grp_bits;
- int alt_gf_bits;
-
- alt_gf_grp_bits =
+ double alt_gf_grp_bits =
(double)cpi->twopass.kf_group_bits *
(mod_frame_err * (double)cpi->baseline_gf_interval) /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
- alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+ int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
(double)allocation_chunks));
- if (gf_bits > alt_gf_bits) {
+ if (gf_bits > alt_gf_bits)
gf_bits = alt_gf_bits;
- }
}
// Else if it is harder than other frames in the group make sure it at
// least receives an allocation in keeping with its relative error
// score, otherwise it may be worse off than an "un-boosted" frame
else {
- int alt_gf_bits =
- (int)((double)cpi->twopass.kf_group_bits *
- mod_frame_err /
- DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+ int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+ mod_frame_err /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
- if (alt_gf_bits > gf_bits) {
+ if (alt_gf_bits > gf_bits)
gf_bits = alt_gf_bits;
- }
}
// Dont allow a negative value for gf_bits
@@ -1983,14 +1922,11 @@
// despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
// calculation of alt_extra_bits.
if (cpi->baseline_gf_interval >= 3) {
- int boost = (cpi->source_alt_ref_pending)
- ? b_boost : cpi->gfu_boost;
+ const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
if (boost >= 150) {
- int pct_extra;
int alt_extra_bits;
-
- pct_extra = (boost - 100) / 50;
+ int pct_extra = (boost - 100) / 50;
pct_extra = (pct_extra > 20) ? 20 : pct_extra;
alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
@@ -2071,33 +2007,21 @@
// Make a damped adjustment to the active max q.
static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
int i;
- int ret_val = new_maxqi;
- double old_q;
- double new_q;
- double target_q;
-
- old_q = vp9_convert_qindex_to_q(old_maxqi);
- new_q = vp9_convert_qindex_to_q(new_maxqi);
-
- target_q = ((old_q * 7.0) + new_q) / 8.0;
+ const double old_q = vp9_convert_qindex_to_q(old_maxqi);  // q value for the old index
+ const double new_q = vp9_convert_qindex_to_q(new_maxqi);  // q value for the new index
+ const double target_q = ((old_q * 7.0) + new_q) / 8.0;  // damped target: 7/8 old + 1/8 new
if (target_q > old_q) {
- for (i = old_maxqi; i <= new_maxqi; i++) {
- if (vp9_convert_qindex_to_q(i) >= target_q) {
- ret_val = i;
- break;
- }
- }
+ for (i = old_maxqi; i <= new_maxqi; i++)  // scan upward from the old index
+ if (vp9_convert_qindex_to_q(i) >= target_q)
+ return i;  // first index whose q reaches the damped target
} else {
- for (i = old_maxqi; i >= new_maxqi; i--) {
- if (vp9_convert_qindex_to_q(i) <= target_q) {
- ret_val = i;
- break;
- }
- }
+ for (i = old_maxqi; i >= new_maxqi; i--)  // scan downward from the old index
+ if (vp9_convert_qindex_to_q(i) <= target_q)
+ return i;  // first index whose q has dropped to the damped target
}
- return ret_val;
+ return new_maxqi;  // no index in the scanned range met the target
}
void vp9_second_pass(VP9_COMP *cpi) {
@@ -2111,9 +2035,8 @@
double this_frame_intra_error;
double this_frame_coded_error;
- if (!cpi->twopass.stats_in) {
+ if (!cpi->twopass.stats_in)
return;
- }
vp9_clear_system_state();
@@ -2123,12 +2046,8 @@
// Set a cq_level in constrained quality mode.
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- int est_cq;
-
- est_cq =
- estimate_cq(cpi,
- &cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left));
+ int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left));
cpi->cq_target_quality = cpi->oxcf.cq_level;
if (est_cq > cpi->cq_target_quality)
@@ -2139,14 +2058,12 @@
cpi->twopass.maxq_max_limit = cpi->worst_quality;
cpi->twopass.maxq_min_limit = cpi->best_quality;
- tmp_q = estimate_max_q(
- cpi,
- &cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left));
+ tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left));
- cpi->active_worst_quality = tmp_q;
- cpi->ni_av_qi = tmp_q;
- cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
#ifndef ONE_SHOT_Q_ESTIMATE
// Limit the maxq value returned subsequently.
@@ -2404,9 +2321,9 @@
if (cpi->oxcf.auto_key
&& lookup_next_frame_stats(cpi, &next_frame) != EOF) {
// Normal scene cut check
- if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+ if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
break;
- }
+
// How fast is prediction quality decaying
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
@@ -2416,19 +2333,14 @@
// quality since the last GF or KF.
recent_loop_decay[i % 8] = loop_decay_rate;
decay_accumulator = 1.0;
- for (j = 0; j < 8; j++) {
- decay_accumulator = decay_accumulator * recent_loop_decay[j];
- }
+ for (j = 0; j < 8; j++)
+ decay_accumulator *= recent_loop_decay[j];
// Special check for transition or high motion followed by a
// to a static scene.
- if (detect_transition_to_still(cpi, i,
- (cpi->key_frame_frequency - i),
- loop_decay_rate,
- decay_accumulator)) {
+ if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+ loop_decay_rate, decay_accumulator))
break;
- }
-
// Step on to the next frame
cpi->twopass.frames_to_key++;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 74caba5..aff5637 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -413,6 +413,201 @@
return besterr;
}
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+ vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+ z, src_stride, &sse, second_pred)
+/*
+ * Sub-pixel refinement of a motion vector for compound prediction.
+ *
+ * bestmv       in:  starting vector; it is used as a whole-sample offset
+ *                   into xd->plane[0].pre[0] and then shifted left by 3.
+ *              out: the refined position (br, bc) in those finer units.
+ * ref_mv       vector that the motion-vector cost and the final range check
+ *              are measured against.
+ * error_per_bit, mvjcost, mvcost
+ *              passed to mv_err_cost() for the central position.
+ * vfp          supplies vf (central position) and svaf (used by DIST).
+ * distortion   out: set to the vfp->vf result for the central position.
+ * sse1         out: SSE written by vfp->vf for the central position.
+ * second_pred  block that is combined with every candidate: through
+ *              comp_avg_pred() for the centre and through vfp->svaf (DIST)
+ *              for sub-pixel candidates.
+ * w, h         dimensions of the comp_pred scratch block; w is also used as
+ *              its stride.
+ *
+ * Returns the best error found, or INT_MAX when the refined vector differs
+ * from ref_mv by more than (MAX_FULL_PEL_VAL << 3) in either component.
+ *
+ * NOTE(review): CHECK_BETTER, PRE, SP and MVC are macros defined outside
+ * this hunk and refer to the locals below by name. Presumably CHECK_BETTER
+ * evaluates DIST at a candidate, stores the result in its first argument and
+ * updates br/bc/besterr (and possibly *distortion and *sse1) on improvement
+ * -- confirm against the macro definitions before renaming anything here.
+ */
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h) {
+ uint8_t *z = x->plane[0].src.buf;  // source block; read by DIST
+ int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int rr, rc, br, bc, hstep;  // rr/rc: ref_mv; br/bc: best so far; hstep: probe step
+ int tr, tc;  // centre of the current probing pass
+ unsigned int besterr = INT_MAX;
+ unsigned int left, right, up, down, diag;  // per-probe results (first arg of CHECK_BETTER)
+ unsigned int sse;  // written through &sse by DIST
+ unsigned int whichdir;
+ unsigned int halfiters = 4;  // pre-decremented in the loop: at most 3 passes
+ unsigned int quarteriters = 4;  // likewise, at most 3 passes
+ unsigned int eighthiters = 4;  // likewise, at most 3 passes
+ int thismse;  // not used directly here; presumably a CHECK_BETTER temporary
+ int maxc, minc, maxr, minr;  // search bounds; presumably enforced by CHECK_BETTER
+ int y_stride;
+ int offset;  // not used directly here; presumably read by PRE
+ int usehp = xd->allow_high_precision_mv;
+
+ uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));  // NOTE(review): not NULL-checked
+ uint8_t *y = xd->plane[0].pre[0].buf +
+ (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+ bestmv->as_mv.col;
+
+ y_stride = xd->plane[0].pre[0].stride;
+
+ rr = ref_mv->as_mv.row;
+ rc = ref_mv->as_mv.col;
+ br = bestmv->as_mv.row << 3;  // 8 sub-units per whole sample
+ bc = bestmv->as_mv.col << 3;
+ hstep = 4;  // half of 8: half-sample step
+ minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+ ((1 << MV_MAX_BITS) - 1));
+ minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+ ((1 << MV_MAX_BITS) - 1));
+
+ tr = br;
+ tc = bc;
+
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+ // central mv, converted in place to the finer (<< 3) units
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+
+ // Calculate the error at the central point: combine the block at y with
+ // second_pred into comp_pred, then take its variance against the source.
+ // TODO(yunqingwang): the central point error was already calculated in the
+ // full-pixel search and could be passed into this function.
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+ error_per_bit, xd->allow_high_precision_mv);
+
+ // Half-sample stage. Each pass probes left/right/up/down of (tr, tc) and
+ // then one diagonal, chosen from the cheaper horizontal and vertical probe.
+ while (--halfiters) {
+ // 1/2 pel
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);  // bit 0: right, bit 1: down
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // Best position is still the pass centre: another pass would repeat it.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ // Quarter-sample stage: same probing pattern as above with the step
+ // halved (1/4 pel).
+ hstep >>= 1;
+ while (--quarteriters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // Best position is still the pass centre: another pass would repeat it.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ if (xd->allow_high_precision_mv) {
+ usehp = vp9_use_nmv_hp(&ref_mv->as_mv);  // decided from ref_mv, not from the candidate
+ } else {
+ usehp = 0;
+ }
+
+ if (usehp) {  // eighth-sample stage, only when high precision is in use
+ hstep >>= 1;
+ while (--eighthiters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // Best position is still the pass centre: another pass would repeat it.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+ }
+ bestmv->as_mv.row = br;  // report the refined vector to the caller
+ bestmv->as_mv.col = bc;
+
+ vpx_free(comp_pred);
+
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;  // result lies too far from ref_mv: reject it
+
+ return besterr;
+}
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
+
#undef MVC
#undef PRE
#undef DIST
@@ -2132,7 +2327,109 @@
return INT_MAX;
}
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ *
+ * Starting from *ref_mv, each of up to search_range passes tests the 8
+ * surrounding positions (4 edge neighbours, then 4 diagonals). For every
+ * position the candidate block is combined with second_pred into a w x h
+ * scratch block and scored with fn_ptr->sdf plus mvsad_err_cost() relative
+ * to center_mv >> 3. The search moves to the best improving neighbour and
+ * stops early when no neighbour improves.
+ *
+ * ref_mv is updated in place with the position found. The return value is
+ * fn_ptr->vf of the final compound block plus mv_err_cost() of
+ * (ref_mv << 3) relative to center_mv, or INT_MAX if bestsad never dropped
+ * below INT_MAX.
+ *
+ * NOTE(review): error_per_bit is used for the SAD-stage cost while the
+ * final mv_err_cost() call uses x->errorperbit -- confirm this is intended.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2], int_mv *center_mv,
+ const uint8_t *second_pred, int w, int h) {
+ const MACROBLOCKD* const xd = &x->e_mbd;
+ MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+ {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};  // {row, col} steps, as used below
+ int i, j;
+ int this_row_offset, this_col_offset;
+ int what_stride = x->plane[0].src.stride;
+ int in_what_stride = xd->plane[0].pre[0].stride;
+ uint8_t *what = x->plane[0].src.buf;  // source block being matched
+ uint8_t *best_address = xd->plane[0].pre[0].buf +
+ (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+ ref_mv->as_mv.col;  // block at the current best position
+ uint8_t *check_here;
+ unsigned int thissad;
+ int_mv this_mv;
+ unsigned int bestsad = INT_MAX;
+ int_mv fcenter_mv;  // center_mv with the low 3 bits shifted out
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ /* Compound pred buffer: w * h bytes, 16-byte aligned, stride w.
+  * NOTE(review): the allocation result is not checked for NULL. */
+ uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Get compound pred by averaging two pred blocks. */
+ comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+ bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;  // index into neighbors[], -1 while nothing has improved
+
+ for (j = 0; j < 8; j++) {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {  // strictly inside the mv limits
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+
+ /* Get compound block and use it to calculate SAD. */
+ comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+ in_what_stride);
+ thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);  // current best passed as last argument
+
+ if (thissad < bestsad) {  // only pay for the mv cost if the raw SAD already wins
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+ mvsadcost, error_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {  // no neighbour improved: stop refining
+ break;
+ } else {
+ ref_mv->as_mv.row += neighbors[best_site].row;  // recentre on the winner
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;  // same scale as center_mv for mv_err_cost
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX) {
+ int besterr;
+ comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);  // rebuild for the final position
+ besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+ (unsigned int *)(&thissad)) +  // thissad only receives the SSE output here
+ mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+ xd->allow_high_precision_mv);
+ vpx_free(comp_pred);
+ return besterr;
+ } else {
+ vpx_free(comp_pred);
+ return INT_MAX;
+ }
+}
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#ifdef ENTROPY_STATS
void print_mode_context(VP9_COMMON *pc) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1ba7fd..cdbd29a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,5 +79,21 @@
int *mvjcost, int *mvcost[2],
int_mv *center_mv);
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h);
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ int_mv *center_mv, const uint8_t *second_pred,
+ int w, int h);
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e55f555..464b649 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1527,10 +1527,11 @@
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \
cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \
cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1539,57 +1540,64 @@
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+ vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+ vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+ vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+ vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
- vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
- vp9_sad16x16x4d)
+ vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+ vp9_variance_halfpixvar16x16_v,
+ vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+ vp9_sad16x16x4d)
BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
- NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+ vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+ vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
- NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+ vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+ vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
- NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+ vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+ vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
- NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+ vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+ vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
@@ -2120,49 +2128,31 @@
const int in_h = src_fb->y_crop_height;
const int out_w = dst_fb->y_crop_width;
const int out_h = dst_fb->y_crop_height;
- int x, y;
+ int x, y, i;
+
+ uint8_t *srcs[3] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer};
+ int src_strides[3] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride};
+
+ uint8_t *dsts[3] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer};
+ int dst_strides[3] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride};
for (y = 0; y < out_h; y += 16) {
for (x = 0; x < out_w; x += 16) {
- int x_q4 = x * 16 * in_w / out_w;
- int y_q4 = y * 16 * in_h / out_h;
- uint8_t *src = src_fb->y_buffer + y * in_h / out_h * src_fb->y_stride +
- x * in_w / out_w;
- uint8_t *dst = dst_fb->y_buffer + y * dst_fb->y_stride + x;
- int src_stride = src_fb->y_stride;
- int dst_stride = dst_fb->y_stride;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ // Plane 0 is luma (y_buffer); planes 1 and 2 are chroma (u/v buffers)
+ // and are handled at half resolution in each direction, so every luma
+ // coordinate is divided by `factor` before it addresses a chroma plane.
+ const int factor = i == 0 ? 1 : 2;
+ const int x_q4 = x * (16 / factor) * in_w / out_w;
+ const int y_q4 = y * (16 / factor) * in_h / out_h;
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+ x / factor * in_w / out_w;
+ // The output block is (16 / factor) samples wide and high, so the
+ // destination offset is scaled by `factor` exactly like the source
+ // offset; using the raw luma x/y here would misplace chroma blocks.
+ uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor;
- vp9_convolve8(src, src_stride, dst, dst_stride,
- vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
- vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
- 16, 16);
-
- x_q4 >>= 1;
- y_q4 >>= 1;
- src_stride = src_fb->uv_stride;
- dst_stride = dst_fb->uv_stride;
-
- src = src_fb->u_buffer +
- y / 2 * in_h / out_h * src_fb->uv_stride +
- x / 2 * in_w / out_w;
- dst = dst_fb->u_buffer +
- y / 2 * dst_fb->uv_stride +
- x / 2;
- vp9_convolve8(src, src_stride, dst, dst_stride,
- vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
- vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
- 8, 8);
-
- src = src_fb->v_buffer +
- y / 2 * in_h / out_h * src_fb->uv_stride +
- x / 2 * in_w / out_w;
- dst = dst_fb->v_buffer +
- y / 2 * dst_fb->uv_stride +
- x / 2;
- vp9_convolve8(src, src_stride, dst, dst_stride,
- vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
- vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
- 8, 8);
+ vp9_convolve8(src, src_stride, dst, dst_stride,
+ vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+ vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+ 16 / factor, 16 / factor);
+ }
}
}
@@ -2873,7 +2863,7 @@
break;
}
- vp9_denoise(cpi->Source, cpi->Source, l, 1, 0);
+ vp9_denoise(cpi->Source, cpi->Source, l);
}
#endif
@@ -3870,16 +3860,8 @@
VP9BORDERINPIXELS);
// Calculate scaling factors for each of the 3 available references
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
- memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
- } else {
- YV12_BUFFER_CONFIG *fb = &cm->yv12_fb[cm->active_ref_idx[i]];
- vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
- fb->y_crop_width, fb->y_crop_height,
- cm->width, cm->height);
- }
- }
+ for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ vp9_setup_scale_factors(cm, i);
vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
@@ -3958,7 +3940,7 @@
double weight = 0;
#if CONFIG_POSTPROC
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
- cm->filter_level * 10 / 6, 1, 0);
+ cm->filter_level * 10 / 6);
#endif
vp9_clear_system_state();
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1b143f5..271a63f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -272,6 +272,7 @@
[ENTROPY_NODES];
int seg_eob, default_eob;
uint8_t token_cache[1024];
+ const uint8_t * band_translate;
// Check for consistency of tx_size with mode info
assert((!type && !plane) || (type && plane));
@@ -291,6 +292,7 @@
coef_probs = cm->fc.coef_probs_4x4;
seg_eob = 16;
scan = get_scan_4x4(tx_type);
+ band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
@@ -304,6 +306,7 @@
scan = get_scan_8x8(tx_type);
coef_probs = cm->fc.coef_probs_8x8;
seg_eob = 64;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
@@ -317,6 +320,7 @@
seg_eob = 256;
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_32X32:
@@ -325,6 +329,7 @@
seg_eob = 1024;
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
default:
abort();
@@ -347,7 +352,7 @@
for (c = 0; c < eob; c++) {
int v = qcoeff_ptr[scan[c]];
int t = vp9_dct_value_tokens_ptr[v].token;
- int band = get_coef_band(scan, tx_size, c);
+ int band = get_coef_band(band_translate, c);
if (c)
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
@@ -361,7 +366,7 @@
if (c)
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
cost += mb->token_costs[tx_size][type][ref]
- [get_coef_band(scan, tx_size, c)]
+ [get_coef_band(band_translate, c)]
[pt][DCT_EOB_TOKEN];
}
}
@@ -684,7 +689,11 @@
int *Distortion, int64_t best_rd) {
int i;
MACROBLOCKD *const xd = &mb->e_mbd;
+#if CONFIG_AB4X4
+ int cost = 0;
+#else
int cost = mb->mbmode_cost[xd->frame_type][I4X4_PRED];
+#endif
int distortion = 0;
int tot_rate_y = 0;
int64_t total_rd = 0;
@@ -714,7 +723,6 @@
total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
t_above + x_idx, t_left + y_idx,
&r, &ry, &d);
-
cost += r;
distortion += d;
tot_rate_y += ry;
@@ -748,6 +756,13 @@
TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
int i;
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8) {
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+ return best_rd;
+ }
+#endif
+
for (i = 0; i < NB_TXFM_MODES; i++)
txfm_cache[i] = INT64_MAX;
@@ -1069,9 +1084,7 @@
B_PREDICTION_MODE modes[4];
int_mv mvs[4], second_mvs[4];
int eobs[4];
-
int mvthresh;
- int *mdcounts;
} BEST_SEG_INFO;
static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -1322,7 +1335,6 @@
int_mv *best_ref_mv,
int_mv *second_best_ref_mv,
int64_t best_rd,
- int *mdcounts,
int *returntotrate,
int *returnyrate,
int *returndistortion,
@@ -1339,7 +1351,6 @@
bsi.second_ref_mv = second_best_ref_mv;
bsi.mvp.as_int = best_ref_mv->as_int;
bsi.mvthresh = mvthresh;
- bsi.mdcounts = mdcounts;
for (i = 0; i < 4; i++)
bsi.modes[i] = ZERO4X4;
@@ -1612,7 +1623,6 @@
int mi_row, int mi_col,
int_mv frame_nearest_mv[MAX_REF_FRAMES],
int_mv frame_near_mv[MAX_REF_FRAMES],
- int frame_mdcounts[4][4],
struct buf_2d yv12_mb[4][MAX_MB_PLANE],
struct scale_factors scale[MAX_REF_FRAMES]) {
VP9_COMMON *cm = &cpi->common;
@@ -1797,7 +1807,7 @@
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize,
- int mdcounts[4], int64_t txfm_cache[],
+ int64_t txfm_cache[],
int *rate2, int *distortion, int *skippable,
int *compmode_cost,
int *rate_y, int *distortion_y,
@@ -1807,8 +1817,9 @@
INTERPOLATIONFILTERTYPE *best_filter,
int_mv frame_mv[MB_MODE_COUNT]
[MAX_REF_FRAMES],
- YV12_BUFFER_CONFIG *scaled_ref_frame,
- int mi_row, int mi_col) {
+ YV12_BUFFER_CONFIG **scaled_ref_frame,
+ int mi_row, int mi_col,
+ int_mv single_newmv[MAX_REF_FRAMES]) {
const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1849,158 @@
ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+ const int b_sz[BLOCK_SIZE_TYPES][2] = {
+ {4, 4},
+ {8, 8},
+ {8, 16},
+ {16, 8},
+ {16, 16},
+ {16, 32},
+ {32, 16},
+ {32, 32},
+ {32, 64},
+ {64, 32},
+ {64, 64}
+ };
+
+ int ite;
+ // Prediction buffer from second frame.
+ uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+ b_sz[bsize][1] * sizeof(uint8_t));
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d scaled_first_yv12;
+ int last_besterr[2] = {INT_MAX, INT_MAX};
+
+ if (scaled_ref_frame[0]) {
+ int i;
+
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[0];
+
+ setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+ NULL, NULL);
+ }
+
+ if (scaled_ref_frame[1]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_second_yv12[i] = xd->plane[i].pre[1];
+
+ setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+ NULL, NULL);
+ }
+ xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+ mi_row, mi_col);
+ xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+ mi_row, mi_col);
+
+ scaled_first_yv12 = xd->plane[0].pre[0];
+
+ // Initialize mv using single prediction mode result.
+ frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ // Run the joint search iteratively, alternating between ref frames, and
+ // break out of the search loop once no better mv is found.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+ xd->plane[0].pre[1]};
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ int_mv tmp_mv;
+ int search_range = 3;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+ int id = ite % 2;
+
+ // Get pred block from the other ref frame (index !id).
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, b_sz[bsize][0],
+ &frame_mv[NEWMV][refs[!id]],
+ &xd->scale_factor[!id],
+ b_sz[bsize][0], b_sz[bsize][1], 0,
+ &xd->subpix);
+
+ // Compound motion search on ref frame id (placed in pre[0] when id != 0).
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[id];
+ vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+ // Use mv result from single mode as mvp.
+ tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+ tmp_mv.as_mv.col >>= 3;
+ tmp_mv.as_mv.row >>= 3;
+
+ // Small-range full-pixel motion search
+ bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+ search_range,
+ &cpi->fn_ptr[block_size],
+ x->nmvjointcost, x->mvcost,
+ &ref_mv[id], second_pred,
+ b_sz[bsize][0], b_sz[bsize][1]);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+
+ bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+ &ref_mv[id],
+ x->errorperbit,
+ &cpi->fn_ptr[block_size],
+ x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred,
+ b_sz[bsize][0],
+ b_sz[bsize][1]);
+ }
+
+ if (id)
+ xd->plane[0].pre[0] = scaled_first_yv12;
+
+ if (bestsme < last_besterr[id]) {
+ frame_mv[NEWMV][refs[id]].as_int =
+ xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ // restore the predictor
+ if (scaled_ref_frame[0]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ if (scaled_ref_frame[1]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[1] = backup_second_yv12[i];
+ }
+
+ vpx_free(second_pred);
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
+
if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
return INT64_MAX;
@@ -1862,7 +2025,7 @@
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
- if (scaled_ref_frame) {
+ if (scaled_ref_frame[0]) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2034,7 @@
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+ setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
NULL, NULL);
}
@@ -1914,6 +2077,7 @@
}
frame_mv[NEWMV][refs[0]].as_int =
xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ single_newmv[refs[0]].as_int = tmp_mv.as_int;
// Add the new motion vector cost to our rolling cost variable
*rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2085,7 @@
96, xd->allow_high_precision_mv);
// restore the predictor, if required
- if (scaled_ref_frame) {
+ if (scaled_ref_frame[0]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2151,8 +2315,14 @@
mode = xd->mode_info_context->mbmi.mode;
txfm_size = xd->mode_info_context->mbmi.txfm_size;
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip, bsize);
+ &dist_uv, &uv_skip,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+ bsize);
+#if CONFIG_AB4X4
+ if (bsize < BLOCK_SIZE_SB8X8)
+#else
if (bsize == BLOCK_SIZE_SB8X8)
+#endif
err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
&rate4x4_y_tokenonly,
&dist4x4_y, err);
@@ -2165,7 +2335,11 @@
sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
xd->mode_info_context->mbmi.mode = mode;
xd->mode_info_context->mbmi.txfm_size = txfm_size;
+#if CONFIG_AB4X4
+ } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) {
+#else
} else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
+#endif
*returnrate = rate4x4_y + rate_uv +
vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
*returndist = dist4x4_y + (dist_uv >> 2);
@@ -2203,15 +2377,14 @@
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred, i;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- int frame_mdcounts[4][4];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ int_mv single_newmv[MAX_REF_FRAMES];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int idx_list[4] = {0,
cpi->lst_fb_idx,
cpi->gld_fb_idx,
cpi->alt_fb_idx};
- int mdcounts[4];
int64_t best_rd = INT64_MAX;
int64_t best_txfm_rd[NB_TXFM_MODES];
int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -2251,6 +2424,7 @@
xd->mode_info_context->mbmi.segment_id = segment_id;
estimate_ref_frame_costs(cpi, segment_id, ref_costs);
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ vpx_memset(&single_newmv, 0, sizeof(single_newmv));
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = INT64_MAX;
@@ -2293,7 +2467,7 @@
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
- frame_mdcounts, yv12_mb, scale_factor);
+ yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -2307,7 +2481,9 @@
i++) {
mbmi->txfm_size = i;
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
- &dist_uv[i], &skip_uv[i], bsize);
+ &dist_uv[i], &skip_uv[i],
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+ bsize);
mode_uv[i] = mbmi->uv_mode;
}
}
@@ -2337,6 +2513,7 @@
|| (cpi->ref_frame_flags & flag_list[ref_frame]))) {
continue;
}
+
if (cpi->speed > 0) {
if (!(ref_frame_mask & (1 << ref_frame))) {
continue;
@@ -2383,10 +2560,18 @@
mbmi->interp_filter = cm->mcomp_filter_type;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+#if CONFIG_AB4X4
+ if (bsize >= BLOCK_SIZE_SB8X8 &&
+ (this_mode == I4X4_PRED || this_mode == SPLITMV))
+ continue;
+ if (bsize < BLOCK_SIZE_SB8X8 &&
+ !(this_mode == I4X4_PRED || this_mode == SPLITMV))
+ continue;
+#else
if (bsize != BLOCK_SIZE_SB8X8 &&
(this_mode == I4X4_PRED || this_mode == SPLITMV))
continue;
-
+#endif
if (comp_pred) {
if (ref_frame == ALTREF_FRAME) {
@@ -2420,8 +2605,6 @@
xd->plane[i].pre[1] = yv12_mb[second_ref][i];
}
- vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
// If the segment reference frame feature is enabled....
// then do nothing if the current ref frame is not allowed..
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
@@ -2451,7 +2634,6 @@
// Note the rate value returned here includes the cost of coding
// the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
- assert(bsize == BLOCK_SIZE_SB8X8);
mbmi->txfm_size = TX_4X4;
rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
&distortion_y, INT64_MAX);
@@ -2519,7 +2701,7 @@
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame][0],
- second_ref, INT64_MAX, mdcounts,
+ second_ref, INT64_MAX,
&rate, &rate_y, &distortion,
&skippable,
(int)this_rd_thresh, seg_mvs);
@@ -2558,7 +2740,7 @@
// switchable list (bilinear, 6-tap) is indicated at the frame level
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame][0],
- second_ref, INT64_MAX, mdcounts,
+ second_ref, INT64_MAX,
&rate, &rate_y, &distortion,
&skippable,
(int)this_rd_thresh, seg_mvs);
@@ -2589,10 +2771,10 @@
// If even the 'Y' rd value of split is higher than best so far
// then dont bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
- bsize);
- vp9_subtract_sbuv(x, bsize);
+ BLOCK_SIZE_SB8X8);
+ vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, bsize, TX_4X4);
+ &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
@@ -2608,7 +2790,7 @@
vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
mbmi->mode = this_mode;
} else {
- YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+ YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
int fb;
if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,17 +2802,31 @@
}
if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
- scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+ scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
+ if (comp_pred) {
+ if (mbmi->second_ref_frame == LAST_FRAME) {
+ fb = cpi->lst_fb_idx;
+ } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+ fb = cpi->gld_fb_idx;
+ } else {
+ fb = cpi->alt_fb_idx;
+ }
+
+ if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+ scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+ }
this_rd = handle_inter_mode(cpi, x, bsize,
- mdcounts, txfm_cache,
+ txfm_cache,
&rate2, &distortion2, &skippable,
&compmode_cost,
&rate_y, &distortion_y,
&rate_uv, &distortion_uv,
&mode_excluded, &disable_skip,
mode_index, &tmp_best_filter, frame_mv,
- scaled_ref_frame, mi_row, mi_col);
+ scaled_ref_frame, mi_row, mi_col,
+ single_newmv);
if (this_rd == INT64_MAX)
continue;
}
@@ -2833,7 +3029,13 @@
}
}
-
+#if CONFIG_AB4X4
+ if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
+ *returnrate = INT_MAX;
+ *returndistortion = INT_MAX;
+ return best_rd;
+ }
+#endif
assert((cm->mcomp_filter_type == SWITCHABLE) ||
(cm->mcomp_filter_type == best_mbmode.interp_filter) ||
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index cb670da..50d849d 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -119,7 +119,12 @@
TOKENEXTRA *t = *tp; /* store tokens starting here */
const int eob = xd->plane[plane].eobs[block];
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+#if CONFIG_AB4X4
+ const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ?
+ BLOCK_SIZE_SB8X8 : mbmi->sb_type;
+#else
const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#endif
const int bwl = b_width_log2(sb_type);
const int off = block >> (2 * tx_size);
const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
@@ -136,6 +141,7 @@
ENTROPY_CONTEXT above_ec, left_ec;
uint8_t token_cache[1024];
TX_TYPE tx_type = DCT_DCT;
+ const uint8_t * band_translate;
assert((!type && !plane) || (type && plane));
switch (tx_size) {
@@ -149,6 +155,7 @@
scan = get_scan_4x4(tx_type);
counts = cpi->coef_counts_4x4;
coef_probs = cpi->common.fc.coef_probs_4x4;
+ band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
@@ -162,6 +169,7 @@
scan = get_scan_8x8(tx_type);
counts = cpi->coef_counts_8x8;
coef_probs = cpi->common.fc.coef_probs_8x8;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
@@ -175,6 +183,7 @@
scan = get_scan_16x16(tx_type);
counts = cpi->coef_counts_16x16;
coef_probs = cpi->common.fc.coef_probs_16x16;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_32X32:
@@ -184,6 +193,7 @@
scan = vp9_default_zig_zag1d_32x32;
counts = cpi->coef_counts_32x32;
coef_probs = cpi->common.fc.coef_probs_32x32;
+ band_translate = vp9_coefband_trans_8x8plus;
break;
}
@@ -196,7 +206,7 @@
c = 0;
do {
- const int band = get_coef_band(scan, tx_size, c);
+ const int band = get_coef_band(band_translate, c);
int token;
int v = 0;
rc = scan[c];
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 13dabbd..306476b 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
@@ -50,6 +51,15 @@
int Refstride,
unsigned int *sse);
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+ int source_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *ref_ptr,
+ int Refstride,
+ unsigned int *sse,
+ const uint8_t *second_pred);
+
typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
int rp, unsigned long *sum_s,
unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@
int ref_stride);
typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
+ vp9_sad_fn_t sdf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_subp_avg_variance_fn_t svaf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
+// #if CONFIG_COMP_INTER_JOINT_SEARCH
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+ int height, uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < weight; j++) {
+ int tmp;
+ tmp = pred[j] + ref[j];
+ comp_pred[j] = (tmp + 1) >> 1;
+ }
+ comp_pred += weight;
+ pred += weight;
+ ref += ref_stride;
+ }
+}
+// #endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c2a6004..fa53abd 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_subpelvar.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
@@ -58,6 +59,29 @@
return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
+ uint8_t temp2[68 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 64, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+ comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+ return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@
return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
+ uint8_t temp2[68 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 65, 32, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+ return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@
return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
+ uint8_t temp2[36 * 32];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 32, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+ return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@
return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
+ uint8_t temp2[36 * 32];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 16, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+ comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+ return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@
return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint8_t temp2[20 * 16];
+ const int16_t *hfilter, *vfilter;
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer
+ uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ // First filter 1d Horizontal
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 5, 4, hfilter);
+
+ // Now filter Vertically
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter);
+ comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+ return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
int src_pixels_per_line,
@@ -339,6 +457,29 @@
return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering
+ uint8_t temp2[20 * 16];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 9, 8, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+ comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+ return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -360,6 +501,30 @@
return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[17 * 16];
+ uint8_t temp2[20 * 16];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 16, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+ comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+ return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -381,6 +546,29 @@
return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
+ uint8_t temp2[68 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 65, 64, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+ comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+ return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -402,6 +590,29 @@
return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
+ uint8_t temp2[36 * 32];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 32, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+ return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@
return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering
+ uint8_t temp2[20 * 16];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 9, 16, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+ comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+ return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -564,3 +798,25 @@
return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering
+ uint8_t temp2[20 * 16];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 8, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+ comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+ return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 45609da..9326165 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -543,31 +543,6 @@
return VPX_CODEC_OK;
}
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
- YV12_BUFFER_CONFIG *yv12) {
- vpx_codec_err_t res = VPX_CODEC_OK;
- yv12->y_buffer = img->planes[VPX_PLANE_Y];
- yv12->u_buffer = img->planes[VPX_PLANE_U];
- yv12->v_buffer = img->planes[VPX_PLANE_V];
-
- yv12->y_crop_width = img->d_w;
- yv12->y_crop_height = img->d_h;
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
-
- yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
- : yv12->y_width;
- yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
- : yv12->y_height;
-
- yv12->y_stride = img->stride[VPX_PLANE_Y];
- yv12->uv_stride = img->stride[VPX_PLANE_U];
-
- yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
- yv12->clrtype = REG_YUV;
- return res;
-}
-
static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
unsigned long duration,
unsigned long deadline) {
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 85022c9..811cea7 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -578,30 +578,6 @@
return res;
}
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
- YV12_BUFFER_CONFIG *yv12) {
- vpx_codec_err_t res = VPX_CODEC_OK;
- yv12->y_buffer = img->planes[VPX_PLANE_Y];
- yv12->u_buffer = img->planes[VPX_PLANE_U];
- yv12->v_buffer = img->planes[VPX_PLANE_V];
-
- yv12->y_crop_width = img->d_w;
- yv12->y_crop_height = img->d_h;
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
- yv12->uv_width = yv12->y_width / 2;
- yv12->uv_height = yv12->y_height / 2;
-
- yv12->y_stride = img->stride[VPX_PLANE_Y];
- yv12->uv_stride = img->stride[VPX_PLANE_U];
-
- yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
- yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
- img->fmt == VPX_IMG_FMT_VPXYV12);
-
- return res;
-}
-
static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 96de5f5..84b4d39 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -37,11 +37,11 @@
img->planes[VPX_PLANE_Y] = yv12->y_buffer;
img->planes[VPX_PLANE_U] = yv12->u_buffer;
img->planes[VPX_PLANE_V] = yv12->v_buffer;
- img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
img->stride[VPX_PLANE_Y] = yv12->y_stride;
img->stride[VPX_PLANE_U] = yv12->uv_stride;
img->stride[VPX_PLANE_V] = yv12->uv_stride;
- img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+ img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
img->bps = bps;
img->user_priv = user_priv;
img->img_data = yv12->buffer_alloc;
@@ -49,4 +49,34 @@
img->self_allocd = 0;
}
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {  // Fills *yv12 from the fields of *img (plane pointers are copied, not the data they point to); always returns VPX_CODEC_OK.
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+ yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];  // copied as-is; the alpha_* fields below are zeroed when this pointer is null
+
+ yv12->y_crop_width = img->d_w;  // crop size and full luma size are both taken from img->d_w / img->d_h
+ yv12->y_crop_height = img->d_h;
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+
+ yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
+ : yv12->y_width;  // a shift of 1 halves the luma width rounding up; any other shift value keeps the luma width
+ yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
+ : yv12->y_height;  // same rule applied to the height
+
+ yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;  // alpha dimensions equal the luma dimensions, or 0 without an alpha plane
+ yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];  // the U stride is used for uv_stride; the V stride is not read here
+ yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;  // NOTE(review): uses img->w here, while the size fields above use img->d_w -- confirm this is intended
+ yv12->clrtype = REG_YUV;
+
+ return VPX_CODEC_OK;
+}
+
#endif // VP9_VP9_IFACE_COMMON_H_
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index cd66f00..99e3543 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -76,12 +76,17 @@
ybf->uv_height = uv_height;
ybf->uv_stride = uv_stride;
+ ybf->alpha_width = 0;
+ ybf->alpha_height = 0;
+ ybf->alpha_stride = 0;
+
ybf->border = border;
ybf->frame_size = frame_size;
ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2;
ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * uv_stride) + border / 2;
+ ybf->alpha_buffer = NULL;
ybf->corrupted = 0; /* assume not currupted by errors */
return 0;
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 85396c0..7b8bd85 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -52,9 +52,14 @@
int uv_stride;
/* int uvinternal_width; */
+ int alpha_width;
+ int alpha_height;
+ int alpha_stride;
+
uint8_t *y_buffer;
uint8_t *u_buffer;
uint8_t *v_buffer;
+ uint8_t *alpha_buffer;
uint8_t *buffer_alloc;
int buffer_alloc_sz;