Optimize highbd 4x16 and 16x4 inv_txfm
Enabled sse4_1 optimizations for tx_sizes 4x16 and 16x4.
Module-level gains:
Tx_size Gain w.r.t. C
4x16 6.01x
16x4 7.36x
When tested on 20 frames of crowd_run_360p_10 at 1 Mbps
with the speed=1 preset, a ~0.5% reduction in encoder time was observed.
Change-Id: I0c5d6530c666150be0de062a7084c7a2bf61410f
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index a6d5138..033eeb7 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -135,6 +135,10 @@
specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/;
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";