/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4,  6, 8, 10,
                                                       12, 14, 1, 3, 5, 7,
                                                       9, 11, 13, 15 };

static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
                                                         10, 11, 8, 9, 10, 11,
                                                         8, 9, 10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };

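// Gather the 8-tap filters for the eight horizontal sub-pixel positions
// sx, sx + alpha, ..., sx + 7 * alpha and interleave their taps so that each
// coeff[i] holds one pair of taps for four output pixels, ready to be
// multiplied against pairs of 16-bit source samples with _mm_madd_epi16.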
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

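// When alpha == 0, every pixel in the row uses the same filter, so the filter
// is loaded once and each pair of taps is replicated across the register with
// the shuffle masks defined above.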
static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

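// Apply the prepared filter taps to one row of source samples. 'src' holds
// 16-bit samples x .. x + 7 and 'src2' holds x + 8 .. x + 15; _mm_alignr_epi8
// shifts across the pair so each multiply-add sees the source window it
// needs. The rounded sums are packed into tmp[k + 7] for the vertical pass.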
static INLINE void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

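// Prepare the coefficients for one row and filter it: the general case where
// alpha may be non-zero.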
static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

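// alpha == 0 and beta == 0: the horizontal filter is the same for every row
// and every column of the block, so the coefficients are prepared once
// outside the row loop.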
static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

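// alpha == 0 but beta != 0: within a row all pixels share one filter, but the
// filter phase changes from row to row, so the alpha0 coefficients are
// rebuilt per row from sx = sx4 + beta * (k + 4).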
static INLINE void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

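// beta == 0 but alpha != 0: the filter varies across the row but is the same
// for every row, so the full coefficient set is prepared once and reused for
// each input row.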
static INLINE void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

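// General case: both alpha and beta are non-zero, so the coefficients are
// recomputed for every row.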
static INLINE void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

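// Dispatch to the cheapest horizontal filter variant for the given alpha and
// beta.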
static INLINE void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);

  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);

  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

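// High bit-depth affine warp, processed in 8x8 output tiles. For each tile
// the affine model 'mat' gives the warped source position of the tile centre;
// 15 rows of source are filtered horizontally into 'tmp' and then filtered
// vertically. In compound mode the rounded intermediate is written to
// conv_params->dst, or averaged with dst and written to 'pred' when
// do_average is set; otherwise the result is rounded, clamped to
// [0, 2^bd - 1] and stored directly in 'pred'.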
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

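        // The block straddles the frame edge. Load the row, then use the
        // warp_pad_left / warp_pad_right shuffle tables to replicate the
        // first / last valid sample into the out-of-range columns before
        // filtering.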
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

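        // res_lo holds the 32-bit vertical sums for output columns 0..3 and
        // res_hi for columns 4..7. Either accumulate into the compound buffer
        // or round down to bd-bit pixels.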
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into bd-bit pixels
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}