Implement av1_txb_init_levels_sse4_1
1. Add sse4_1 version of av1_txb_init_levels.
2. The unit test shows it is 1.7x ~ 8.1x faster
than the C version.
3. The encoder is about 1.7% faster, as shown by
encoding 10 frames of foreman_cif.y4m.
a) gcc (Ubuntu 7.2.0-8ubuntu3.2) 7.2.0
b) CPU: Intel(R) Core(TM) i7-6900K CPU @ 3.20GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=10
Change-Id: I0c2502c3efb39b0197da18aabb4e8255929fcb32
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 74b2fc4..c9f5904 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -68,7 +68,7 @@
av1_free_txb_buf(cpi);
// TODO(jingning): This should be further reduced.
CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
- aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+ aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
}
void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
@@ -425,9 +425,8 @@
qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
}
-static INLINE void av1_txb_init_levels(const tran_low_t *const coeff,
- const int width, const int height,
- uint8_t *const levels) {
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
const int stride = width + TX_PAD_HOR;
uint8_t *ls = levels;