/* vp8/encoder/x86/preproc_mmx.c - avm - Git at Google */

 /*
  *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license and patent
  *  grant that can be found in the LICENSE file in the root of the source
  *  tree. All contributing project authors may be found in the AUTHORS
  *  file in the root of the source tree.
  */


 #include "memory.h"
 #include "preproc.h"
 #include "pragmas.h"

 /****************************************************************************
 *  Macros
 ****************************************************************************/
 /* Number of history slots the filters below keep for every pixel group. */
 #define FRAMECOUNT 7
 /* Round X up to the next multiple of 32; the result has type unsigned long.
  * The argument is parenthesised so that an expression argument is cast as a
  * whole, and the mask is written as ~31UL so that no high bits are cleared
  * on targets where unsigned long is wider than 32 bits. */
 #define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 ) & ~31UL )

 /****************************************************************************
 *  Imports
 ****************************************************************************/
 /* Defined elsewhere and not called in the code visible here.  Judging by the
  * parameter names it presumably reports MMX / XMM / WMT availability through
  * the three out-pointers -- confirm against its definition. */
 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);

 /****************************************************************************
 *  Exported Global Variables
 ****************************************************************************/
 /* Function pointer with the same signature as temp_filter_wmt() and
  * temp_filter_mmx() below.  It is not assigned anywhere in the code visible
  * here; presumably it is pointed at one of those variants during
  * initialisation (TODO confirm). */
 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);

 /****************************************************************************
  *
  *  ROUTINE       : temp_filter_wmt
  *
  *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
  *                  unsigned char *s     : Pointer to source frame.
  *                  unsigned char *d     : Pointer to destination frame.
  *                  int bytes            : Number of bytes to filter.
  *                  int strength         : Strength of filter to apply.
  *
  *  OUTPUTS       : None.
  *
  *  RETURNS       : void
  *
  *  FUNCTION      : Performs a closeness-adjusted temporal blur
  *
  *  SPECIAL NOTES : Destination frame can be same as source frame.
  *
  ****************************************************************************/
 /*
  * XMM-register variant: filters 8 pixels per iteration.
  *
  * ppi->frame_buffer is walked as consecutive records of FRAMECOUNT 8-byte
  * slots, one record per group of 8 pixels.  On the first call
  * (ppi->frame == 0) every slot is seeded with the source pixels and the
  * source is copied to the destination unchanged.  On later calls the slot
  * selected by (ppi->frame % FRAMECOUNT) is overwritten with the new source
  * pixels, and each output pixel is a weighted average over all FRAMECOUNT
  * slots, where a slot's weight shrinks as it differs more from the source.
  *
  * NOTE: both loops are do/while and step by 8, so one group is processed
  * even when bytes <= 0, and a whole group of 8 is read and written even when
  * bytes is not a multiple of 8.
  */
 void temp_filter_wmt
 (
     pre_proc_instance *ppi,
     unsigned char *s,
     unsigned char *d,
     int bytes,
     int strength
 )
 {
     int byte = 0;                                   /* pixels handled so far */
     unsigned char *frameptr = ppi->frame_buffer;    /* cursor into the slot records */

     /* Per-lane constants used as memory operands by the asm below. */
     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};

     if (ppi->frame == 0)
     {
         /* First call: seed all FRAMECOUNT slots of every record with the
          * source pixels and pass the source through unfiltered. */
         do
         {
             int i;
             int frame = 0;

             do
             {
                 for (i = 0; i < 8; i++)
                 {
                     *frameptr = s[byte+i];
                     ++frameptr;
                 }

                 ++frame;
             }
             while (frame < FRAMECOUNT);

             for (i = 0; i < 8; i++)
                 d[byte+i] = s[byte+i];

             byte += 8;

         }
         while (byte < bytes);
     }
     else
     {
         int i;
         int offset2 = (ppi->frame % FRAMECOUNT);    /* slot that receives the new frame */

         do
         {
             /* Written by the asm block, read by the C loop that follows it. */
             __declspec(align(16)) unsigned short counts[8];
             __declspec(align(16)) unsigned short sums[8];
             __asm
             {
                 mov         eax, offset2            // eax = slot index to overwrite
                 mov         edi, s                  // edi = source pixels
                 pxor        xmm1, xmm1              // xmm1 = weighted pixel sums (8 words)

                 pxor        xmm7, xmm7              // xmm7 = 0, used to widen bytes to words

                 mov         esi, frameptr           // esi = first slot of this group's record
                 pxor        xmm2, xmm2              // xmm2 = sum of weights (8 words)

                 movq        xmm3, QWORD PTR [edi]   // load 8 source pixels

                 movq        QWORD PTR [esi+8*eax], xmm3 // store them into slot offset2

                 punpcklbw   xmm3, xmm2              // xmm3 = source pixels as words (xmm2 is zero here)
                 mov         ecx,  FRAMECOUNT        // ecx = slots left to visit

                 next_frame:
                 movq        xmm4, QWORD PTR [esi]   // load the 8 pixels of this slot
                 punpcklbw   xmm4, xmm7              // widen them to words
                 movdqa      xmm6, xmm4              // keep the slot pixels for the multiply below
                 psubsw      xmm4, xmm3              // slot pixel - source pixel
                 pmullw      xmm4, xmm4              // squared difference (low 16 bits)
                 movd        xmm5, strength          // shift count
                 psrlw       xmm4, xmm5              // squared difference >> strength
                 pmullw      xmm4, threes            // times 3 (low 16 bits)
                 movdqa      xmm5, sixteens          // 16 in every lane
                 psubusw     xmm5, xmm4              // weight = 16 - that, clamped at 0 by unsigned saturation
                 movdqa      xmm4, xmm5              // copy of the weights
                 pmullw      xmm4, xmm6              // weight * slot pixel
                 paddusw     xmm1, xmm4              // add into the weighted sums (saturating)
                 paddusw     xmm2, xmm5              // add into the weight totals (saturating)
                 add         esi, 8                  // advance to the next slot
                 dec         ecx                     // one fewer slot to visit
                 jnz         next_frame

                 movdqa      counts, xmm2            // export the weight totals
                 psrlw       xmm2, 1                 // half of each total, for rounding
                 paddusw     xmm1, xmm2              // rounding added in

                 mov         frameptr, esi           // esi now points at the next group's record

                 movdqa      sums, xmm1              // export the rounded weighted sums
             }

             /* Divide each sum by its weight total via a table lookup and a
              * 16-bit shift; fixed_divide[] presumably holds 65536/n style
              * reciprocals (TODO confirm against where it is built). */
             for (i = 0; i < 8; i++)
             {
                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
                 blurvalue >>= 16;
                 d[i] = blurvalue;
             }

             s += 8;
             d += 8;
             byte += 8;
         }
         while (byte < bytes);
     }

     ++ppi->frame;
     /* NOTE(review): the asm above only touches XMM registers, so this emms
      * looks like it is kept for symmetry with temp_filter_mmx -- confirm. */
     __asm emms
 }

 /****************************************************************************
  *
  *  ROUTINE       : temp_filter_mmx
  *
  *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
  *                  unsigned char *s     : Pointer to source frame.
  *                  unsigned char *d     : Pointer to destination frame.
  *                  int bytes            : Number of bytes to filter.
  *                  int strength         : Strength of filter to apply.
  *
  *  OUTPUTS       : None.
  *
  *  RETURNS       : void
  *
  *  FUNCTION      : Performs a closeness-adjusted temporal blur
  *
  *  SPECIAL NOTES : Destination frame can be same as source frame.
  *
  ****************************************************************************/
 /*
  * MMX-register variant: filters 4 pixels per iteration.
  *
  * ppi->frame_buffer is walked as consecutive records of FRAMECOUNT 4-byte
  * slots, one record per group of 4 pixels.  On the first call
  * (ppi->frame == 0) every slot is seeded with the source pixels and the
  * source is copied to the destination unchanged.  On later calls the slot
  * selected by (ppi->frame % FRAMECOUNT) is overwritten with the new source
  * pixels, and each output pixel is a weighted average over all FRAMECOUNT
  * slots, where a slot's weight shrinks as it differs more from the source.
  *
  * NOTE: both loops are do/while and step by 4, so one group is processed
  * even when bytes <= 0, and a whole group of 4 is read and written even when
  * bytes is not a multiple of 4.
  */
 void temp_filter_mmx
 (
     pre_proc_instance *ppi,
     unsigned char *s,
     unsigned char *d,
     int bytes,
     int strength
 )
 {
     int byte = 0;                                   /* pixels handled so far */
     unsigned char *frameptr = ppi->frame_buffer;    /* cursor into the slot records */

     /* Per-lane constants used as memory operands by the asm below. */
     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};

     if (ppi->frame == 0)
     {
         /* First call: seed all FRAMECOUNT slots of every record with the
          * source pixels and pass the source through unfiltered. */
         do
         {
             int i;
             int frame = 0;

             do
             {
                 for (i = 0; i < 4; i++)
                 {
                     *frameptr = s[byte+i];
                     ++frameptr;
                 }

                 ++frame;
             }
             while (frame < FRAMECOUNT);

             for (i = 0; i < 4; i++)
                 d[byte+i] = s[byte+i];

             byte += 4;

         }
         while (byte < bytes);
     }
     else
     {
         int i;
         int offset2 = (ppi->frame % FRAMECOUNT);    /* slot that receives the new frame */

         do
         {
             /* Written by the asm block, read by the C loop that follows it.
              * Declared with 8 entries, but only the first 4 are stored
              * (movq) and read here. */
             __declspec(align(16)) unsigned short counts[8];
             __declspec(align(16)) unsigned short sums[8];
             __asm
             {

                 mov         eax, offset2            // eax = slot index to overwrite
                 mov         edi, s                  // edi = source pixels
                 pxor        mm1, mm1                // mm1 = weighted pixel sums (4 words)
                 pxor        mm7, mm7                // mm7 = 0, used to widen bytes to words

                 mov         esi, frameptr           // esi = first slot of this group's record
                 pxor        mm2, mm2                // mm2 = sum of weights (4 words)

                 movd        mm3, DWORD PTR [edi]    // load 4 source pixels
                 movd        DWORD PTR [esi+4*eax], mm3 // store them into slot offset2

                 punpcklbw   mm3, mm2                // mm3 = source pixels as words (mm2 is zero here)
                 mov         ecx,  FRAMECOUNT        // ecx = slots left to visit

                 next_frame:
                 movd        mm4, DWORD PTR [esi]    // load the 4 pixels of this slot
                 punpcklbw   mm4, mm7                // widen them to words
                 movq        mm6, mm4                // keep the slot pixels for the multiply below
                 psubsw      mm4, mm3                // slot pixel - source pixel
                 pmullw      mm4, mm4                // squared difference (low 16 bits)
                 movd        mm5, strength           // shift count
                 psrlw       mm4, mm5                // squared difference >> strength
                 pmullw      mm4, threes             // times 3 (low 16 bits)
                 movq        mm5, sixteens           // 16 in every lane
                 psubusw     mm5, mm4                // weight = 16 - that, clamped at 0 by unsigned saturation
                 movq        mm4, mm5                // copy of the weights
                 pmullw      mm4, mm6                // weight * slot pixel
                 paddusw     mm1, mm4                // add into the weighted sums (saturating)
                 paddusw     mm2, mm5                // add into the weight totals (saturating)
                 add         esi, 4                  // advance to the next slot
                 dec         ecx                     // one fewer slot to visit
                 jnz         next_frame

                 movq        counts, mm2             // export the weight totals
                 psrlw       mm2, 1                  // half of each total, for rounding
                 paddusw     mm1, mm2                // rounding added in

                 mov         frameptr, esi           // esi now points at the next group's record

                 movq        sums, mm1               // export the rounded weighted sums

             }

             /* Divide each sum by its weight total via a table lookup and a
              * 16-bit shift; fixed_divide[] presumably holds 65536/n style
              * reciprocals (TODO confirm against where it is built). */
             for (i = 0; i < 4; i++)
             {
                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
                 blurvalue >>= 16;
                 d[i] = blurvalue;
             }

             s += 4;
             d += 4;
             byte += 4;
         }
         while (byte < bytes);
     }

     ++ppi->frame;
     /* Leave MMX state so the registers are usable for x87 code again. */
     __asm emms
 }
	/*
	* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license and patent
	* grant that can be found in the LICENSE file in the root of the source
	* tree. All contributing project authors may be found in the AUTHORS
	* file in the root of the source tree.
	*/


	#include "memory.h"
	#include "preproc.h"
	#include "pragmas.h"

	/****************************************************************************
	* Macros
	****************************************************************************/
	/* Number of history slots the filters below keep for every pixel group. */
	#define FRAMECOUNT 7
	/* Round X up to the next multiple of 32; the result has type unsigned long.
	 * The argument is parenthesised so that an expression argument is cast as a
	 * whole, and the mask is written as ~31UL so that no high bits are cleared
	 * on targets where unsigned long is wider than 32 bits. */
	#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 ) & ~31UL )

	/****************************************************************************
	* Imports
	****************************************************************************/
	/* All three parameters are out-pointers.  The first two had lost their '*',
	 * which made this prototype conflict with the other declaration of the same
	 * function in this file.  Not called in the code visible here; presumably
	 * it reports MMX / XMM / WMT availability -- confirm against its definition. */
	extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);

	/****************************************************************************
	* Exported Global Variables
	****************************************************************************/
	/* Function pointer with the same signature as temp_filter_wmt() and
	 * temp_filter_mmx() below.  The '*' declarators had been lost, which turned
	 * this into a function declaration taking its arguments by value and made it
	 * conflict with the other temp_filter declaration in this file.  It is not
	 * assigned in the code visible here; presumably it is pointed at one of the
	 * variants during initialisation (TODO confirm). */
	void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);

	/****************************************************************************
	*
	* ROUTINE : temp_filter_wmt
	*
	* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
	* unsigned char *s : Pointer to source frame.
	* unsigned char *d : Pointer to destination frame.
	* int bytes : Number of bytes to filter.
	* int strength : Strength of filter to apply.
	*
	* OUTPUTS : None.
	*
	* RETURNS : void
	*
	* FUNCTION : Performs a closeness-adjusted temporal blur
	*
	* SPECIAL NOTES : Destination frame can be same as source frame.
	*
	****************************************************************************/
	/*
	 * XMM-register variant: filters 8 pixels per iteration.
	 *
	 * ppi->frame_buffer is walked as consecutive records of FRAMECOUNT 8-byte
	 * slots, one record per group of 8 pixels.  On the first call
	 * (ppi->frame == 0) every slot is seeded with the source pixels and the
	 * source is copied to the destination unchanged.  On later calls the slot
	 * selected by (ppi->frame % FRAMECOUNT) is overwritten with the new source
	 * pixels, and each output pixel is a weighted average over all FRAMECOUNT
	 * slots, where a slot's weight shrinks as it differs more from the source.
	 *
	 * NOTE: both loops are do/while and step by 8, so one group is processed
	 * even when bytes <= 0, and a whole group of 8 is read and written even when
	 * bytes is not a multiple of 8.
	 */
	void temp_filter_wmt
	(
	pre_proc_instance *ppi,
	unsigned char *s,
	unsigned char *d,
	int bytes,
	int strength
	)
	{
	int byte = 0; /* pixels handled so far */
	unsigned char *frameptr = ppi->frame_buffer; /* cursor into the slot records */

	/* Per-lane constants used as memory operands by the asm below. */
	__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
	__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};

	if (ppi->frame == 0)
	{
	/* First call: seed all FRAMECOUNT slots of every record with the
	 * source pixels and pass the source through unfiltered. */
	do
	{
	int i;
	int frame = 0;

	do
	{
	for (i = 0; i < 8; i++)
	{
	*frameptr = s[byte+i];
	++frameptr;
	}

	++frame;
	}
	while (frame < FRAMECOUNT);

	for (i = 0; i < 8; i++)
	d[byte+i] = s[byte+i];

	byte += 8;

	}
	while (byte < bytes);
	}
	else
	{
	int i;
	int offset2 = (ppi->frame % FRAMECOUNT); /* slot that receives the new frame */

	do
	{
	/* Written by the asm block, read by the C loop that follows it. */
	__declspec(align(16)) unsigned short counts[8];
	__declspec(align(16)) unsigned short sums[8];
	__asm
	{
	mov eax, offset2 // eax = slot index to overwrite
	mov edi, s // edi = source pixels
	pxor xmm1, xmm1 // xmm1 = weighted pixel sums (8 words)

	pxor xmm7, xmm7 // xmm7 = 0, used to widen bytes to words

	mov esi, frameptr // esi = first slot of this group's record
	pxor xmm2, xmm2 // xmm2 = sum of weights (8 words)

	movq xmm3, QWORD PTR [edi] // load 8 source pixels

	movq QWORD PTR [esi+8*eax], xmm3 // store them into slot offset2

	punpcklbw xmm3, xmm2 // xmm3 = source pixels as words (xmm2 is zero here)
	mov ecx, FRAMECOUNT // ecx = slots left to visit

	next_frame:
	movq xmm4, QWORD PTR [esi] // load the 8 pixels of this slot
	punpcklbw xmm4, xmm7 // widen them to words
	movdqa xmm6, xmm4 // keep the slot pixels for the multiply below
	psubsw xmm4, xmm3 // slot pixel - source pixel
	pmullw xmm4, xmm4 // squared difference (low 16 bits)
	movd xmm5, strength // shift count
	psrlw xmm4, xmm5 // squared difference >> strength
	pmullw xmm4, threes // times 3 (low 16 bits)
	movdqa xmm5, sixteens // 16 in every lane
	psubusw xmm5, xmm4 // weight = 16 - that, clamped at 0 by unsigned saturation
	movdqa xmm4, xmm5 // copy of the weights
	pmullw xmm4, xmm6 // weight * slot pixel
	paddusw xmm1, xmm4 // add into the weighted sums (saturating)
	paddusw xmm2, xmm5 // add into the weight totals (saturating)
	add esi, 8 // advance to the next slot
	dec ecx // one fewer slot to visit
	jnz next_frame

	movdqa counts, xmm2 // export the weight totals
	psrlw xmm2, 1 // half of each total, for rounding
	paddusw xmm1, xmm2 // rounding added in

	mov frameptr, esi // esi now points at the next group's record

	movdqa sums, xmm1 // export the rounded weighted sums
	}

	/* Divide each sum by its weight total via a table lookup and a
	 * 16-bit shift; fixed_divide[] presumably holds 65536/n style
	 * reciprocals (TODO confirm against where it is built). */
	for (i = 0; i < 8; i++)
	{
	int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
	blurvalue >>= 16;
	d[i] = blurvalue;
	}

	s += 8;
	d += 8;
	byte += 8;
	}
	while (byte < bytes);
	}

	++ppi->frame;
	/* NOTE(review): the asm above only touches XMM registers, so this emms
	 * looks like it is kept for symmetry with temp_filter_mmx -- confirm. */
	__asm emms
	}

	/****************************************************************************
	*
	* ROUTINE : temp_filter_mmx
	*
	* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
	* unsigned char *s : Pointer to source frame.
	* unsigned char *d : Pointer to destination frame.
	* int bytes : Number of bytes to filter.
	* int strength : Strength of filter to apply.
	*
	* OUTPUTS : None.
	*
	* RETURNS : void
	*
	* FUNCTION : Performs a closeness-adjusted temporal blur
	*
	* SPECIAL NOTES : Destination frame can be same as source frame.
	*
	****************************************************************************/
	/*
	 * MMX-register variant: filters 4 pixels per iteration.
	 *
	 * ppi->frame_buffer is walked as consecutive records of FRAMECOUNT 4-byte
	 * slots, one record per group of 4 pixels.  On the first call
	 * (ppi->frame == 0) every slot is seeded with the source pixels and the
	 * source is copied to the destination unchanged.  On later calls the slot
	 * selected by (ppi->frame % FRAMECOUNT) is overwritten with the new source
	 * pixels, and each output pixel is a weighted average over all FRAMECOUNT
	 * slots, where a slot's weight shrinks as it differs more from the source.
	 *
	 * NOTE: both loops are do/while and step by 4, so one group is processed
	 * even when bytes <= 0, and a whole group of 4 is read and written even when
	 * bytes is not a multiple of 4.
	 */
	void temp_filter_mmx
	(
	pre_proc_instance *ppi,
	unsigned char *s,
	unsigned char *d,
	int bytes,
	int strength
	)
	{
	int byte = 0; /* pixels handled so far */
	unsigned char *frameptr = ppi->frame_buffer; /* cursor into the slot records */

	/* Per-lane constants used as memory operands by the asm below. */
	__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
	__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};

	if (ppi->frame == 0)
	{
	/* First call: seed all FRAMECOUNT slots of every record with the
	 * source pixels and pass the source through unfiltered. */
	do
	{
	int i;
	int frame = 0;

	do
	{
	for (i = 0; i < 4; i++)
	{
	*frameptr = s[byte+i];
	++frameptr;
	}

	++frame;
	}
	while (frame < FRAMECOUNT);

	for (i = 0; i < 4; i++)
	d[byte+i] = s[byte+i];

	byte += 4;

	}
	while (byte < bytes);
	}
	else
	{
	int i;
	int offset2 = (ppi->frame % FRAMECOUNT); /* slot that receives the new frame */

	do
	{
	/* Written by the asm block, read by the C loop that follows it.
	 * Declared with 8 entries, but only the first 4 are stored (movq)
	 * and read here. */
	__declspec(align(16)) unsigned short counts[8];
	__declspec(align(16)) unsigned short sums[8];
	__asm
	{

	mov eax, offset2 // eax = slot index to overwrite
	mov edi, s // edi = source pixels
	pxor mm1, mm1 // mm1 = weighted pixel sums (4 words)
	pxor mm7, mm7 // mm7 = 0, used to widen bytes to words

	mov esi, frameptr // esi = first slot of this group's record
	pxor mm2, mm2 // mm2 = sum of weights (4 words)

	movd mm3, DWORD PTR [edi] // load 4 source pixels
	movd DWORD PTR [esi+4*eax], mm3 // store them into slot offset2

	punpcklbw mm3, mm2 // mm3 = source pixels as words (mm2 is zero here)
	mov ecx, FRAMECOUNT // ecx = slots left to visit

	next_frame:
	movd mm4, DWORD PTR [esi] // load the 4 pixels of this slot
	punpcklbw mm4, mm7 // widen them to words
	movq mm6, mm4 // keep the slot pixels for the multiply below
	psubsw mm4, mm3 // slot pixel - source pixel
	pmullw mm4, mm4 // squared difference (low 16 bits)
	movd mm5, strength // shift count
	psrlw mm4, mm5 // squared difference >> strength
	pmullw mm4, threes // times 3 (low 16 bits)
	movq mm5, sixteens // 16 in every lane
	psubusw mm5, mm4 // weight = 16 - that, clamped at 0 by unsigned saturation
	movq mm4, mm5 // copy of the weights
	pmullw mm4, mm6 // weight * slot pixel
	paddusw mm1, mm4 // add into the weighted sums (saturating)
	paddusw mm2, mm5 // add into the weight totals (saturating)
	add esi, 4 // advance to the next slot
	dec ecx // one fewer slot to visit
	jnz next_frame

	movq counts, mm2 // export the weight totals
	psrlw mm2, 1 // half of each total, for rounding
	paddusw mm1, mm2 // rounding added in

	mov frameptr, esi // esi now points at the next group's record

	movq sums, mm1 // export the rounded weighted sums

	}

	/* Divide each sum by its weight total via a table lookup and a
	 * 16-bit shift; fixed_divide[] presumably holds 65536/n style
	 * reciprocals (TODO confirm against where it is built). */
	for (i = 0; i < 4; i++)
	{
	int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
	blurvalue >>= 16;
	d[i] = blurvalue;
	}

	s += 4;
	d += 4;
	byte += 4;
	}
	while (byte < bytes);
	}

	++ppi->frame;
	/* Leave MMX state so the registers are usable for x87 code again. */
	__asm emms
	}