/* aom_dsp/x86/masked_sad_intrin_ssse3.c - aom - Git at Google */

 /*
  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <stdlib.h>
 #include <string.h>
 #include <emmintrin.h>
 #include <tmmintrin.h>

 #include "aom_ports/mem.h"
 #include "./aom_config.h"
 #include "aom/aom_integer.h"

 // Loads two 8-byte rows, one at ptr and one at ptr + stride, and returns them
 // packed into a single register: the first row in the low 64 bits and the
 // second row in the high 64 bits.
 static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
   const __m128i row0 = _mm_loadl_epi64((const __m128i *)&ptr[0]);
   const __m128i row1 = _mm_loadl_epi64((const __m128i *)&ptr[stride]);
   return _mm_unpacklo_epi64(row0, row1);
 }

 // Loads four 4-byte rows (at ptr, ptr + stride, ptr + 2 * stride and
 // ptr + 3 * stride) and packs them into one register, row 0 in the lowest
 // 32 bits through row 3 in the highest 32 bits.
 //
 // Each row is fetched with memcpy() instead of dereferencing a casted 32-bit
 // pointer: the rows are only known to be byte-aligned, and reading uint8_t
 // data through a wider lvalue violates the C aliasing rules.
 static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
   int32_t row[4];
   __m128i rows01, rows23;
   int i;

   for (i = 0; i < 4; i++) {
     memcpy(&row[i], ptr + stride * i, sizeof(row[i]));
   }

   // Interleave rows pairwise, then join the two 64-bit halves.
   rows01 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(row[0]),
                               _mm_cvtsi32_si128(row[1]));
   rows23 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(row[2]),
                               _mm_cvtsi32_si128(row[3]));
   return _mm_unpacklo_epi64(rows01, rows23);
 }

 // Forward declarations of the kernels defined further down in this file, so
 // that the wrapper macros below can call them before their definitions.

 // Kernel taking explicit width and height arguments.
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                             const uint8_t *b_ptr, int b_stride,
                                             const uint8_t *m_ptr, int m_stride,
                                             int width, int height);

 // Kernel for 8-pixel-wide blocks; only the height is passed.
 static INLINE unsigned int masked_sad8xh_ssse3(
     const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height);

 // Kernel for 4-pixel-wide blocks; only the height is passed.
 static INLINE unsigned int masked_sad4xh_ssse3(
     const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height);

 // Defines the non-static function aom_masked_sad<m>x<n>_ssse3(), which
 // forwards its arguments to masked_sad_ssse3() with width m and height n.
 #define MASKSADMXN_SSSE3(m, n)                                                 \
   unsigned int aom_masked_sad##m##x##n##_ssse3(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
       const uint8_t *msk, int msk_stride) {                                    \
     return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                             m, n);                                             \
   }

 // Instantiations. The sizes with a 128 dimension are only generated when
 // CONFIG_EXT_PARTITION is non-zero.
 #if CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(128, 128)
 MASKSADMXN_SSSE3(128, 64)
 MASKSADMXN_SSSE3(64, 128)
 #endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
 MASKSADMXN_SSSE3(32, 32)
 MASKSADMXN_SSSE3(32, 16)
 MASKSADMXN_SSSE3(16, 32)
 MASKSADMXN_SSSE3(16, 16)
 MASKSADMXN_SSSE3(16, 8)

 // Defines the non-static function aom_masked_sad8x<n>_ssse3(), which forwards
 // its arguments to masked_sad8xh_ssse3() with height n.
 #define MASKSAD8XN_SSSE3(n)                                                   \
   unsigned int aom_masked_sad8x##n##_ssse3(                                   \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
       const uint8_t *msk, int msk_stride) {                                   \
     return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                                msk_stride, n);                                \
   }

 // Instantiations for the 8-wide block sizes.
 MASKSAD8XN_SSSE3(16)
 MASKSAD8XN_SSSE3(8)
 MASKSAD8XN_SSSE3(4)

 // Defines the non-static function aom_masked_sad4x<n>_ssse3(), which forwards
 // its arguments to masked_sad4xh_ssse3() with height n.
 #define MASKSAD4XN_SSSE3(n)                                                   \
   unsigned int aom_masked_sad4x##n##_ssse3(                                   \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
       const uint8_t *msk, int msk_stride) {                                   \
     return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                                msk_stride, n);                                \
   }

 // Instantiations for the 4-wide block sizes.
 MASKSAD4XN_SSSE3(8)
 MASKSAD4XN_SSSE3(4)

 // Masked SAD over a width x height block, processed 16 pixels per step.
 // Intended for widths that are a multiple of 16; mask values are assumed to
 // be <= 64.
 // Returns the sum over the block of |a - b| * m, rounded as (sum + 31) >> 6.
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                             const uint8_t *b_ptr, int b_stride,
                                             const uint8_t *m_ptr, int m_stride,
                                             int width, int height) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i ones_16 = _mm_set1_epi16(1);
   __m128i acc = zero;
   int row, col;

   for (row = 0; row < height; ++row) {
     for (col = 0; col < width; col += 16) {
       const __m128i src = _mm_loadu_si128((const __m128i *)&a_ptr[col]);
       const __m128i ref = _mm_loadu_si128((const __m128i *)&b_ptr[col]);
       const __m128i mask = _mm_loadu_si128((const __m128i *)&m_ptr[col]);
       // |a - b| per byte: one of the two saturating differences is always 0,
       // so OR-ing them yields the absolute difference.
       const __m128i abs_diff =
           _mm_or_si128(_mm_subs_epu8(src, ref), _mm_subs_epu8(ref, src));
       // Weight each difference by its mask byte; adjacent products are summed
       // into 16-bit lanes.
       const __m128i weighted = _mm_maddubs_epi16(abs_diff, mask);

       // Multiplying by 1 with madd widens the 16-bit sums to 32 bits.
       acc = _mm_add_epi32(acc, _mm_madd_epi16(weighted, ones_16));
     }
     // Advance all three planes to the next row.
     a_ptr += a_stride;
     b_ptr += b_stride;
     m_ptr += m_stride;
   }

   // Two horizontal adds collapse the four 32-bit partial sums into lane 0.
   acc = _mm_hadd_epi32(acc, zero);
   acc = _mm_hadd_epi32(acc, zero);
   // Rounding applied to the total: (sad + 31) >> 6.
   return (_mm_cvtsi128_si32(acc) + 31) >> 6;
 }

 static INLINE unsigned int masked_sad8xh_ssse3(
     const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height) {
   int y;
   __m128i a, b, m, temp1, temp2, row_res;
   __m128i res = _mm_setzero_si128();
   __m128i one = _mm_set1_epi16(1);
   // Add the masked SAD for 2 rows at a time
   for (y = 0; y < height; y += 2) {
     // Load a, b, m in xmm registers
     a = width8_load_2rows(a_ptr, a_stride);
     b = width8_load_2rows(b_ptr, b_stride);
     m = width8_load_2rows(m_ptr, m_stride);

     // Calculate the difference between a & b
     temp1 = _mm_subs_epu8(a, b);
     temp2 = _mm_subs_epu8(b, a);
     temp1 = _mm_or_si128(temp1, temp2);

     // Multiply by m and add together
     row_res = _mm_maddubs_epi16(temp1, m);

     // Pad out row result to 32 bit integers & add to running total
     res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

     // Move onto the next rows
     a_ptr += a_stride * 2;
     b_ptr += b_stride * 2;
     m_ptr += m_stride * 2;
   }
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   // sad = (sad + 31) >> 6;
   return (_mm_cvtsi128_si32(res) + 31) >> 6;
 }

 static INLINE unsigned int masked_sad4xh_ssse3(
     const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height) {
   int y;
   __m128i a, b, m, temp1, temp2, row_res;
   __m128i res = _mm_setzero_si128();
   __m128i one = _mm_set1_epi16(1);
   // Add the masked SAD for 4 rows at a time
   for (y = 0; y < height; y += 4) {
     // Load a, b, m in xmm registers
     a = width4_load_4rows(a_ptr, a_stride);
     b = width4_load_4rows(b_ptr, b_stride);
     m = width4_load_4rows(m_ptr, m_stride);

     // Calculate the difference between a & b
     temp1 = _mm_subs_epu8(a, b);
     temp2 = _mm_subs_epu8(b, a);
     temp1 = _mm_or_si128(temp1, temp2);

     // Multiply by m and add together
     row_res = _mm_maddubs_epi16(temp1, m);

     // Pad out row result to 32 bit integers & add to running total
     res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

     // Move onto the next rows
     a_ptr += a_stride * 4;
     b_ptr += b_stride * 4;
     m_ptr += m_stride * 4;
   }
   // Pad out row result to 32 bit integers & add to running total
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   // sad = (sad + 31) >> 6;
   return (_mm_cvtsi128_si32(res) + 31) >> 6;
 }

 #if CONFIG_AOM_HIGHBITDEPTH
 // Loads two rows of four 16-bit samples (8 bytes each), one at ptr and one at
 // ptr + stride, and packs them into a single register: the first row in the
 // low 64 bits, the second row in the high 64 bits. `stride` is applied to a
 // uint16_t pointer, i.e. it counts 16-bit elements.
 static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                                int stride) {
   const __m128i row0 = _mm_loadl_epi64((const __m128i *)&ptr[0]);
   const __m128i row1 = _mm_loadl_epi64((const __m128i *)&ptr[stride]);
   return _mm_unpacklo_epi64(row0, row1);
 }

 // Forward declarations of the high-bitdepth kernels defined further down, so
 // that the wrapper macros below can call them before their definitions.

 // Kernel taking explicit width and height arguments.
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int width, int height);

 // Kernel for 4-pixel-wide blocks; only the height is passed.
 static INLINE unsigned int highbd_masked_sad4xh_ssse3(
     const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height);

 // Defines the non-static function aom_highbd_masked_sad<m>x<n>_ssse3(), which
 // forwards its arguments to highbd_masked_sad_ssse3() with width m and
 // height n.
 #define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
   unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
       const uint8_t *msk, int msk_stride) {                                   \
     return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
                                    msk_stride, m, n);                         \
   }

 // Instantiations. The sizes with a 128 dimension are only generated when
 // CONFIG_EXT_PARTITION is non-zero. Unlike the 8-bit path, the 8-wide sizes
 // also go through this generic wrapper.
 #if CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(128, 128)
 HIGHBD_MASKSADMXN_SSSE3(128, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 128)
 #endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
 HIGHBD_MASKSADMXN_SSSE3(32, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 16)
 HIGHBD_MASKSADMXN_SSSE3(16, 32)
 HIGHBD_MASKSADMXN_SSSE3(16, 16)
 HIGHBD_MASKSADMXN_SSSE3(16, 8)
 HIGHBD_MASKSADMXN_SSSE3(8, 16)
 HIGHBD_MASKSADMXN_SSSE3(8, 8)
 HIGHBD_MASKSADMXN_SSSE3(8, 4)

 // Defines the non-static function aom_highbd_masked_sad4x<n>_ssse3(), which
 // forwards its arguments to highbd_masked_sad4xh_ssse3() with height n.
 #define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
   unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
       const uint8_t *msk, int msk_stride) {                                   \
     return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
                                       msk_stride, n);                         \
   }

 // Instantiations for the 4-wide block sizes.
 HIGHBD_MASKSAD4XN_SSSE3(8)
 HIGHBD_MASKSAD4XN_SSSE3(4)

 // High-bitdepth masked SAD over a width x height block, 8 pixels per step.
 // Intended for widths that are a multiple of 8; mask values are assumed to
 // be <= 64.
 //
 // a8_ptr and b8_ptr are turned into uint16_t sample pointers with
 // CONVERT_TO_SHORTPTR(), and a_stride / b_stride are applied to those
 // uint16_t pointers. The mask is read as 8-bit values and widened to 16 bits.
 // Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int width, int height) {
   int y, x;
   __m128i a, b, m, temp1, temp2;
   const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
   const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
   __m128i res = _mm_setzero_si128();
   // For each row
   for (y = 0; y < height; y++) {
     // Covering the full width
     for (x = 0; x < width; x += 8) {
       // Load 8 samples of a and b, and 8 mask bytes zero-extended to 16 bits
       a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
       b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
       m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
                             _mm_setzero_si128());

       // Absolute difference: one of the two saturating subtractions is 0
       temp1 = _mm_subs_epu16(a, b);
       temp2 = _mm_subs_epu16(b, a);
       temp1 = _mm_or_si128(temp1, temp2);

       // Multiply by m, add adjacent pairs, and add to the running total
       res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
     }
     // Move onto the next row
     a_ptr += a_stride;
     b_ptr += b_stride;
     m_ptr += m_stride;
   }
   // Reduce the four 32-bit partial sums into lane 0
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   // sad = (sad + 31) >> 6;
   // The total is taken as unsigned: the lane additions wrap modulo 2^32, and
   // a large block of wide samples (e.g. 128 * 128 * 4095 * 64) can exceed
   // INT_MAX, which as a signed int would be negative and round incorrectly.
   return ((unsigned int)_mm_cvtsi128_si32(res) + 31) >> 6;
 }

 // High-bitdepth masked SAD for 4-pixel-wide blocks, two rows per iteration.
 //
 // a8_ptr and b8_ptr are turned into uint16_t sample pointers with
 // CONVERT_TO_SHORTPTR(); two rows of 4 samples are packed into one register
 // by highbd_width4_load_2rows(). `height` is consumed in steps of 2.
 // Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
 static INLINE unsigned int highbd_masked_sad4xh_ssse3(
     const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
     const uint8_t *m_ptr, int m_stride, int height) {
   int y;
   __m128i a, b, m, temp1, temp2;
   int32_t m_row0, m_row1;
   const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
   const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
   __m128i res = _mm_setzero_si128();
   // Add the masked SAD for 2 rows at a time
   for (y = 0; y < height; y += 2) {
     // Load a, b in xmm registers
     a = highbd_width4_load_2rows(a_ptr, a_stride);
     b = highbd_width4_load_2rows(b_ptr, b_stride);
     // Read exactly the 4 mask bytes of each row. Only those 4 bytes are used,
     // so a wider load would touch memory beyond the 4-pixel row for nothing.
     memcpy(&m_row0, m_ptr, sizeof(m_row0));
     memcpy(&m_row1, m_ptr + m_stride, sizeof(m_row1));
     // Place the two mask rows side by side and zero-extend to 16 bits
     m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(m_row0),
                                              _mm_cvtsi32_si128(m_row1)),
                           _mm_setzero_si128());

     // Absolute difference: one of the two saturating subtractions is 0
     temp1 = _mm_subs_epu16(a, b);
     temp2 = _mm_subs_epu16(b, a);
     temp1 = _mm_or_si128(temp1, temp2);

     // Multiply by m, add adjacent pairs, and add to the running total
     res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));

     // Move onto the next rows
     a_ptr += a_stride * 2;
     b_ptr += b_stride * 2;
     m_ptr += m_stride * 2;
   }
   // Reduce the four 32-bit partial sums into lane 0
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   res = _mm_hadd_epi32(res, _mm_setzero_si128());
   // sad = (sad + 31) >> 6;
   return (_mm_cvtsi128_si32(res) + 31) >> 6;
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
	/*
	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include <stdlib.h>
	#include <emmintrin.h>
	#include <tmmintrin.h>

	#include "aom_ports/mem.h"
	#include "./aom_config.h"
	#include "aom/aom_integer.h"

	// Loads two 8-byte rows, one at ptr and one at ptr + stride, and returns
	// them packed into one register: first row in the low 64 bits, second row
	// in the high 64 bits.
	static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
	  const __m128i row0 = _mm_loadl_epi64((const __m128i *)&ptr[0]);
	  const __m128i row1 = _mm_loadl_epi64((const __m128i *)&ptr[stride]);
	  return _mm_unpacklo_epi64(row0, row1);
	}

	// Loads four 4-byte rows (at ptr, ptr + stride, ptr + 2 * stride and
	// ptr + 3 * stride) and packs them into one register, row 0 in the lowest
	// 32 bits through row 3 in the highest 32 bits.
	//
	// The row contents are read with memcpy(): the pixel data must be loaded
	// (not the pointer value converted to an integer), and a byte-wise copy has
	// no alignment or aliasing requirements.
	static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
	  int32_t row[4];
	  __m128i rows01, rows23;
	  int i;

	  for (i = 0; i < 4; i++) {
	    memcpy(&row[i], ptr + stride * i, sizeof(row[i]));
	  }

	  // Interleave rows pairwise, then join the two 64-bit halves.
	  rows01 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(row[0]),
	                              _mm_cvtsi32_si128(row[1]));
	  rows23 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(row[2]),
	                              _mm_cvtsi32_si128(row[3]));
	  return _mm_unpacklo_epi64(rows01, rows23);
	}

	// Forward declarations of the kernels defined further down in this file.
	// All pixel and mask arguments are pointers to uint8_t data.

	// Kernel taking explicit width and height arguments.
	static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
	                                            const uint8_t *b_ptr, int b_stride,
	                                            const uint8_t *m_ptr, int m_stride,
	                                            int width, int height);

	// Kernel for 8-pixel-wide blocks; only the height is passed.
	static INLINE unsigned int masked_sad8xh_ssse3(
	    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height);

	// Kernel for 4-pixel-wide blocks; only the height is passed.
	static INLINE unsigned int masked_sad4xh_ssse3(
	    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height);

	// Defines the non-static function aom_masked_sad<m>x<n>_ssse3(), which
	// forwards its arguments to masked_sad_ssse3() with width m and height n.
	#define MASKSADMXN_SSSE3(m, n) \
	  unsigned int aom_masked_sad##m##x##n##_ssse3( \
	      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
	      const uint8_t *msk, int msk_stride) { \
	    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
	                            m, n); \
	  }

	// Instantiations. The sizes with a 128 dimension are only generated when
	// CONFIG_EXT_PARTITION is non-zero.
	#if CONFIG_EXT_PARTITION
	MASKSADMXN_SSSE3(128, 128)
	MASKSADMXN_SSSE3(128, 64)
	MASKSADMXN_SSSE3(64, 128)
	#endif // CONFIG_EXT_PARTITION
	MASKSADMXN_SSSE3(64, 64)
	MASKSADMXN_SSSE3(64, 32)
	MASKSADMXN_SSSE3(32, 64)
	MASKSADMXN_SSSE3(32, 32)
	MASKSADMXN_SSSE3(32, 16)
	MASKSADMXN_SSSE3(16, 32)
	MASKSADMXN_SSSE3(16, 16)
	MASKSADMXN_SSSE3(16, 8)

	// Defines the non-static function aom_masked_sad8x<n>_ssse3(), which
	// forwards its arguments to masked_sad8xh_ssse3() with height n.
	#define MASKSAD8XN_SSSE3(n) \
	  unsigned int aom_masked_sad8x##n##_ssse3( \
	      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
	      const uint8_t *msk, int msk_stride) { \
	    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
	                               msk_stride, n); \
	  }

	// Instantiations for the 8-wide block sizes.
	MASKSAD8XN_SSSE3(16)
	MASKSAD8XN_SSSE3(8)
	MASKSAD8XN_SSSE3(4)

	// Defines the non-static function aom_masked_sad4x<n>_ssse3(), which
	// forwards its arguments to masked_sad4xh_ssse3() with height n.
	#define MASKSAD4XN_SSSE3(n) \
	  unsigned int aom_masked_sad4x##n##_ssse3( \
	      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
	      const uint8_t *msk, int msk_stride) { \
	    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
	                               msk_stride, n); \
	  }

	// Instantiations for the 4-wide block sizes.
	MASKSAD4XN_SSSE3(8)
	MASKSAD4XN_SSSE3(4)

	// Masked SAD over a width x height block, processed 16 pixels per step.
	// Intended for widths that are a multiple of 16; mask values are assumed to
	// be <= 64.
	// Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
	static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
	                                            const uint8_t *b_ptr, int b_stride,
	                                            const uint8_t *m_ptr, int m_stride,
	                                            int width, int height) {
	  const __m128i zero = _mm_setzero_si128();
	  const __m128i ones_16 = _mm_set1_epi16(1);
	  __m128i acc = zero;
	  int row, col;

	  for (row = 0; row < height; ++row) {
	    for (col = 0; col < width; col += 16) {
	      const __m128i src = _mm_loadu_si128((const __m128i *)&a_ptr[col]);
	      const __m128i ref = _mm_loadu_si128((const __m128i *)&b_ptr[col]);
	      const __m128i mask = _mm_loadu_si128((const __m128i *)&m_ptr[col]);
	      // |a - b| per byte: one of the two saturating differences is always
	      // 0, so OR-ing them yields the absolute difference.
	      const __m128i abs_diff =
	          _mm_or_si128(_mm_subs_epu8(src, ref), _mm_subs_epu8(ref, src));
	      // Weight each difference by its mask byte; adjacent products are
	      // summed into 16-bit lanes.
	      const __m128i weighted = _mm_maddubs_epi16(abs_diff, mask);

	      // Multiplying by 1 with madd widens the 16-bit sums to 32 bits.
	      acc = _mm_add_epi32(acc, _mm_madd_epi16(weighted, ones_16));
	    }
	    // Advance all three planes to the next row.
	    a_ptr += a_stride;
	    b_ptr += b_stride;
	    m_ptr += m_stride;
	  }

	  // Two horizontal adds collapse the four 32-bit partial sums into lane 0.
	  acc = _mm_hadd_epi32(acc, zero);
	  acc = _mm_hadd_epi32(acc, zero);
	  // Rounding applied to the total: (sad + 31) >> 6.
	  return (_mm_cvtsi128_si32(acc) + 31) >> 6;
	}

	// Masked SAD for 8-pixel-wide blocks, handling two rows per iteration.
	// a_ptr, b_ptr and m_ptr point to uint8_t data; each iteration packs two
	// consecutive rows of each into one register via width8_load_2rows(), so
	// `height` is consumed in steps of 2.
	// Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
	static INLINE unsigned int masked_sad8xh_ssse3(
	    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height) {
	  int y;
	  __m128i a, b, m, temp1, temp2, row_res;
	  __m128i res = _mm_setzero_si128();
	  __m128i one = _mm_set1_epi16(1);
	  // Add the masked SAD for 2 rows at a time
	  for (y = 0; y < height; y += 2) {
	    // Load a, b, m in xmm registers
	    a = width8_load_2rows(a_ptr, a_stride);
	    b = width8_load_2rows(b_ptr, b_stride);
	    m = width8_load_2rows(m_ptr, m_stride);

	    // Absolute difference: one of the two saturating subtractions is 0
	    temp1 = _mm_subs_epu8(a, b);
	    temp2 = _mm_subs_epu8(b, a);
	    temp1 = _mm_or_si128(temp1, temp2);

	    // Multiply by m and add adjacent pairs together
	    row_res = _mm_maddubs_epi16(temp1, m);

	    // Pad out row result to 32 bit integers & add to running total
	    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

	    // Move onto the next rows
	    a_ptr += a_stride * 2;
	    b_ptr += b_stride * 2;
	    m_ptr += m_stride * 2;
	  }
	  // Reduce the four 32-bit partial sums into lane 0
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  // sad = (sad + 31) >> 6;
	  return (_mm_cvtsi128_si32(res) + 31) >> 6;
	}

	// Masked SAD for 4-pixel-wide blocks, handling four rows per iteration.
	// a_ptr, b_ptr and m_ptr point to uint8_t data; each iteration packs four
	// consecutive rows of each into one register via width4_load_4rows(), so
	// `height` is consumed in steps of 4.
	// Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
	static INLINE unsigned int masked_sad4xh_ssse3(
	    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height) {
	  int y;
	  __m128i a, b, m, temp1, temp2, row_res;
	  __m128i res = _mm_setzero_si128();
	  __m128i one = _mm_set1_epi16(1);
	  // Add the masked SAD for 4 rows at a time
	  for (y = 0; y < height; y += 4) {
	    // Load a, b, m in xmm registers
	    a = width4_load_4rows(a_ptr, a_stride);
	    b = width4_load_4rows(b_ptr, b_stride);
	    m = width4_load_4rows(m_ptr, m_stride);

	    // Absolute difference: one of the two saturating subtractions is 0
	    temp1 = _mm_subs_epu8(a, b);
	    temp2 = _mm_subs_epu8(b, a);
	    temp1 = _mm_or_si128(temp1, temp2);

	    // Multiply by m and add adjacent pairs together
	    row_res = _mm_maddubs_epi16(temp1, m);

	    // Pad out row result to 32 bit integers & add to running total
	    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

	    // Move onto the next rows
	    a_ptr += a_stride * 4;
	    b_ptr += b_stride * 4;
	    m_ptr += m_stride * 4;
	  }
	  // Reduce the four 32-bit partial sums into lane 0
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  // sad = (sad + 31) >> 6;
	  return (_mm_cvtsi128_si32(res) + 31) >> 6;
	}

	#if CONFIG_AOM_HIGHBITDEPTH
	// Loads two rows of four 16-bit samples (8 bytes each), one at ptr and one
	// at ptr + stride, and packs them into one register: first row in the low
	// 64 bits, second row in the high 64 bits. `stride` is applied to a
	// uint16_t pointer, i.e. it counts 16-bit elements.
	static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
	                                               int stride) {
	  const __m128i row0 = _mm_loadl_epi64((const __m128i *)&ptr[0]);
	  const __m128i row1 = _mm_loadl_epi64((const __m128i *)&ptr[stride]);
	  return _mm_unpacklo_epi64(row0, row1);
	}

	// Forward declarations of the high-bitdepth kernels defined further down.
	// All pixel and mask arguments are passed as pointers to uint8_t.

	// Kernel taking explicit width and height arguments.
	static INLINE unsigned int highbd_masked_sad_ssse3(
	    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int width, int height);

	// Kernel for 4-pixel-wide blocks; only the height is passed.
	static INLINE unsigned int highbd_masked_sad4xh_ssse3(
	    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height);

	// Defines the non-static function aom_highbd_masked_sad<m>x<n>_ssse3(),
	// which forwards its arguments to highbd_masked_sad_ssse3() with width m
	// and height n.
	#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
	  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
	      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
	      const uint8_t *msk, int msk_stride) { \
	    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
	                                   msk_stride, m, n); \
	  }

	// Instantiations. The sizes with a 128 dimension are only generated when
	// CONFIG_EXT_PARTITION is non-zero. The 8-wide sizes also go through this
	// generic wrapper.
	#if CONFIG_EXT_PARTITION
	HIGHBD_MASKSADMXN_SSSE3(128, 128)
	HIGHBD_MASKSADMXN_SSSE3(128, 64)
	HIGHBD_MASKSADMXN_SSSE3(64, 128)
	#endif // CONFIG_EXT_PARTITION
	HIGHBD_MASKSADMXN_SSSE3(64, 64)
	HIGHBD_MASKSADMXN_SSSE3(64, 32)
	HIGHBD_MASKSADMXN_SSSE3(32, 64)
	HIGHBD_MASKSADMXN_SSSE3(32, 32)
	HIGHBD_MASKSADMXN_SSSE3(32, 16)
	HIGHBD_MASKSADMXN_SSSE3(16, 32)
	HIGHBD_MASKSADMXN_SSSE3(16, 16)
	HIGHBD_MASKSADMXN_SSSE3(16, 8)
	HIGHBD_MASKSADMXN_SSSE3(8, 16)
	HIGHBD_MASKSADMXN_SSSE3(8, 8)
	HIGHBD_MASKSADMXN_SSSE3(8, 4)

	// Defines the non-static function aom_highbd_masked_sad4x<n>_ssse3(), which
	// forwards its arguments to highbd_masked_sad4xh_ssse3() with height n.
	#define HIGHBD_MASKSAD4XN_SSSE3(n) \
	  unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
	      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
	      const uint8_t *msk, int msk_stride) { \
	    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
	                                      msk_stride, n); \
	  }

	// Instantiations for the 4-wide block sizes.
	HIGHBD_MASKSAD4XN_SSSE3(8)
	HIGHBD_MASKSAD4XN_SSSE3(4)

	// High-bitdepth masked SAD over a width x height block, 8 pixels per step.
	// Intended for widths that are a multiple of 8; mask values are assumed to
	// be <= 64.
	//
	// a8_ptr and b8_ptr are turned into uint16_t sample pointers with
	// CONVERT_TO_SHORTPTR(), and a_stride / b_stride are applied to those
	// uint16_t pointers. The mask is read as 8-bit values and widened to 16 bits.
	// Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
	static INLINE unsigned int highbd_masked_sad_ssse3(
	    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int width, int height) {
	  int y, x;
	  __m128i a, b, m, temp1, temp2;
	  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
	  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
	  __m128i res = _mm_setzero_si128();
	  // For each row
	  for (y = 0; y < height; y++) {
	    // Covering the full width
	    for (x = 0; x < width; x += 8) {
	      // Load 8 samples of a and b, and 8 mask bytes zero-extended to 16 bits
	      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
	      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
	      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
	                            _mm_setzero_si128());

	      // Absolute difference: one of the two saturating subtractions is 0
	      temp1 = _mm_subs_epu16(a, b);
	      temp2 = _mm_subs_epu16(b, a);
	      temp1 = _mm_or_si128(temp1, temp2);

	      // Multiply by m, add adjacent pairs, and add to the running total
	      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
	    }
	    // Move onto the next row
	    a_ptr += a_stride;
	    b_ptr += b_stride;
	    m_ptr += m_stride;
	  }
	  // Reduce the four 32-bit partial sums into lane 0
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  // sad = (sad + 31) >> 6;
	  // The total is taken as unsigned: the lane additions wrap modulo 2^32, and
	  // a large block of wide samples (e.g. 128 * 128 * 4095 * 64) can exceed
	  // INT_MAX, which as a signed int would be negative and round incorrectly.
	  return ((unsigned int)_mm_cvtsi128_si32(res) + 31) >> 6;
	}

	// High-bitdepth masked SAD for 4-pixel-wide blocks, two rows per iteration.
	//
	// a8_ptr and b8_ptr are turned into uint16_t sample pointers with
	// CONVERT_TO_SHORTPTR(); two rows of 4 samples are packed into one register
	// by highbd_width4_load_2rows(). `height` is consumed in steps of 2.
	// Returns the sum of |a - b| * m, rounded as (sum + 31) >> 6.
	static INLINE unsigned int highbd_masked_sad4xh_ssse3(
	    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
	    const uint8_t *m_ptr, int m_stride, int height) {
	  int y;
	  __m128i a, b, m, temp1, temp2;
	  int32_t m_row0, m_row1;
	  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
	  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
	  __m128i res = _mm_setzero_si128();
	  // Add the masked SAD for 2 rows at a time
	  for (y = 0; y < height; y += 2) {
	    // Load a, b in xmm registers
	    a = highbd_width4_load_2rows(a_ptr, a_stride);
	    b = highbd_width4_load_2rows(b_ptr, b_stride);
	    // Read exactly the 4 mask bytes of each row. Only those 4 bytes are
	    // used, so a wider load would touch memory beyond the row for nothing.
	    memcpy(&m_row0, m_ptr, sizeof(m_row0));
	    memcpy(&m_row1, m_ptr + m_stride, sizeof(m_row1));
	    // Place the two mask rows side by side and zero-extend to 16 bits
	    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(m_row0),
	                                             _mm_cvtsi32_si128(m_row1)),
	                          _mm_setzero_si128());

	    // Absolute difference: one of the two saturating subtractions is 0
	    temp1 = _mm_subs_epu16(a, b);
	    temp2 = _mm_subs_epu16(b, a);
	    temp1 = _mm_or_si128(temp1, temp2);

	    // Multiply by m, add adjacent pairs, and add to the running total
	    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));

	    // Move onto the next rows
	    a_ptr += a_stride * 2;
	    b_ptr += b_stride * 2;
	    m_ptr += m_stride * 2;
	  }
	  // Reduce the four 32-bit partial sums into lane 0
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  res = _mm_hadd_epi32(res, _mm_setzero_si128());
	  // sad = (sad + 31) >> 6;
	  return (_mm_cvtsi128_si32(res) + 31) >> 6;
	}
	#endif // CONFIG_AOM_HIGHBITDEPTH