aom_dsp/x86/subpel_variance_sse2.asm - aom - Git at Google

 ;
 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
 ; This source code is subject to the terms of the BSD 2 Clause License and
 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 ; was not distributed with this source code in the LICENSE file, you can
 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

 ;

 %include "third_party/x86inc/x86inc.asm"

 SECTION_RODATA
 pw_8: times  8 dw  8
 bilin_filter_m_sse2: times  8 dw 16
                      times  8 dw  0
                      times  8 dw 14
                      times  8 dw  2
                      times  8 dw 12
                      times  8 dw  4
                      times  8 dw 10
                      times  8 dw  6
                      times 16 dw  8
                      times  8 dw  6
                      times  8 dw 10
                      times  8 dw  4
                      times  8 dw 12
                      times  8 dw  2
                      times  8 dw 14

 bilin_filter_m_ssse3: times  8 db 16,  0
                       times  8 db 14,  2
                       times  8 db 12,  4
                       times  8 db 10,  6
                       times 16 db  8
                       times  8 db  6, 10
                       times  8 db  4, 12
                       times  8 db  2, 14

 SECTION .text

 ; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
 ;                               int height, unsigned int *sse);
 ;
 ; This function returns the SE and stores SSE in the given pointer.

 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
   psubw                %3, %4
   psubw                %1, %2
   paddw                %5, %3
   pmaddwd              %3, %3
   paddw                %5, %1
   pmaddwd              %1, %1
   paddd                %6, %3
   paddd                %6, %1
 %endmacro

 %macro STORE_AND_RET 1
 %if %1 > 4
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
   ; and outputing to a dword.
   pcmpgtw              m5, m6           ; mask for 0 > x
   movhlps              m3, m7
   punpcklwd            m4, m6, m5
   punpckhwd            m6, m5           ; sign-extend m6 word->dword
   paddd                m7, m3
   paddd                m6, m4
   pshufd               m3, m7, 0x1
   movhlps              m4, m6
   paddd                m7, m3
   paddd                m6, m4
   mov                  r1, ssem         ; r1 = unsigned int *sse
   pshufd               m4, m6, 0x1
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %else ; 4xh
   pshuflw              m4, m6, 0xe
   pshuflw              m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
   pshuflw              m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
   RET
 %endmacro

 %macro INC_SRC_BY_SRC_STRIDE  0
 %if ARCH_X86=1 && CONFIG_PIC=1
   add                srcq, src_stridemp
 %else
   add                srcq, src_strideq
 %endif
 %endmacro

 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
 %define filter_idx_shift 4
 %else
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
 %endif
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64

 %ifdef PIC    ; 64bit PIC
   %if %2 == 1 ; avg
     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                       x_offset, y_offset, \
                                       dst, dst_stride, \
                                       sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
     cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                   y_offset, dst, dst_stride, height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
   %if ARCH_X86=1 && CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                   x_offset, y_offset, \
                                   dst, dst_stride, \
                                   sec, sec_stride, \
                                   height, sse, g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp

       ;Store bilin_filter and pw_8 location in stack
       %if GET_GOT_DEFINED == 1
         GET_GOT eax
         add esp, 4                ; restore esp
       %endif

       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx

       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx

       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                 y_offset, dst, dst_stride, height, sse, \
                                 g_bilin_filter, g_pw_8
       %define block_height heightd

       ;Store bilin_filter and pw_8 location in stack
       %if GET_GOT_DEFINED == 1
         GET_GOT eax
         add esp, 4                ; restore esp
       %endif

       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx

       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx

       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                         7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                              x_offset, y_offset, \
                                              dst, dst_stride, \
                                              sec, sec_stride, \
                                              height, sse
       %if ARCH_X86_64
       %define block_height heightd
       %define sec_str sec_strideq
       %else
       %define block_height dword heightm
       %define sec_str sec_stridemp
       %endif
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                               y_offset, dst, dst_stride, height, sse
       %define block_height heightd
     %endif

     %define bilin_filter bilin_filter_m
   %endif
 %endif

 %if %1 == 4
   %define movx movd
 %else
   %define movx movh
 %endif

   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
   ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
   ; could perhaps use it for something more productive then
   pxor                 m5, m5           ; dedicated zero register
 %if %1 < 16
   sar                   block_height, 1
 %if %2 == 1 ; avg
   shl             sec_str, 1
 %endif
 %endif

   ; FIXME(rbultje) replace by jumptable?
   test          x_offsetd, x_offsetd
   jnz .x_nonzero
   ; x_offset == 0
   test          y_offsetd, y_offsetd
   jnz .x_zero_y_nonzero

   ; x_offset == 0 && y_offset == 0
 .x_zero_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
   mova                 m1, [dstq]
 %if %2 == 1 ; avg
   pavgb                m0, [secq]
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5

 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
 %if %2 == 1 ; avg
 %if %1 > 4
   movhps               m0, [srcq+src_strideq]
 %else ; 4xh
   movx                 m1, [srcq+src_strideq]
   punpckldq            m0, m1
 %endif
 %else ; !avg
   movx                 m2, [srcq+src_strideq]
 %endif

   movx                 m1, [dstq]
   movx                 m3, [dstq+dst_strideq]

 %if %2 == 1 ; avg
 %if %1 > 4
   pavgb                m0, [secq]
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
   STORE_AND_RET %1

 .x_zero_y_nonzero:
   cmp           y_offsetd, 4
   jne .x_zero_y_nonhalf

   ; x_offset == 0 && y_offset == 0.5
 .x_zero_y_half_loop:
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
   mova                 m1, [dstq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
   pavgb                m0, [secq]
 %endif
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
 %if %1 > 4
   movhps               m2, [srcq+src_strideq*2]
 %else ; 4xh
   movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
 %endif
   movx                 m1, [dstq]
 %if %1 > 4
   movlhps              m0, m2
 %else ; 4xh
   punpckldq            m0, m2
 %endif
   movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
 %if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
   movh                 m4, [secq]
   pavgb                m0, m4
   punpcklbw            m3, m5
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %else ; !avg
   movx                 m4, [srcq+src_strideq*2]
   movx                 m1, [dstq]
   pavgb                m0, m2
   movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
   STORE_AND_RET %1

 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
 %ifdef PIC
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
   mova                m10, [pw_8]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
 %if ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0, reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
   mov tempq, g_pw_8m
 %define filter_rnd [tempq]
 %else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
 %endif

 .x_zero_y_other_loop:
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
   mova                 m1, [dstq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
   pmaddubsw            m2, filter_y_a
   pmaddubsw            m0, filter_y_a
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
 %else
   punpckhbw            m2, m0, m5
   punpckhbw            m3, m4, m5
   punpcklbw            m0, m5
   punpcklbw            m4, m5
   ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
   ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
   ; instructions is the same (5), but it is 1 mul instead of 2, so might be
   ; slightly faster because of pmullw latency. It would also cut our rodata
   ; tables in half for this function, and save 1-2 registers on x86-64.
   pmullw               m2, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m2, filter_rnd
   pmullw               m0, filter_y_a
   pmullw               m4, filter_y_b
   paddw                m0, filter_rnd
   paddw                m2, m3
   paddw                m0, m4
 %endif
   psraw                m2, 4
   psraw                m0, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m2, [srcq+src_strideq]
   movx                 m4, [srcq+src_strideq*2]
   movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
   pmaddubsw            m2, filter_y_a
   punpcklbw            m3, m5
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
 %else
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m4, m5
   pmullw               m0, filter_y_a
   pmullw               m1, m2, filter_y_b
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   pmullw               m2, filter_y_a
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
   movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
 %if %1 == 4
   movlhps              m0, m2
 %endif
   packuswb             m0, m2
 %if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_zero_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
   STORE_AND_RET %1

 .x_nonzero:
   cmp           x_offsetd, 4
   jne .x_nonhalf
   ; x_offset == 0.5
   test          y_offsetd, y_offsetd
   jnz .x_half_y_nonzero

   ; x_offset == 0.5 && y_offset == 0
 .x_half_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
   mova                 m1, [dstq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
   pavgb                m0, [secq]
 %endif
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m4, [srcq+1]
 %if %2 == 1 ; avg
 %if %1 > 4
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
 %else ; 4xh
   movx                 m1, [srcq+src_strideq]
   punpckldq            m0, m1
   movx                 m2, [srcq+src_strideq+1]
   punpckldq            m4, m2
 %endif
   movx                 m1, [dstq]
   movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
 %if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m1, m5
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %else ; !avg
   movx                 m2, [srcq+src_strideq]
   movx                 m1, [dstq]
   pavgb                m0, m4
   movx                 m4, [srcq+src_strideq+1]
   movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
   STORE_AND_RET %1

 .x_half_y_nonzero:
   cmp           y_offsetd, 4
   jne .x_half_y_nonhalf

   ; x_offset == 0.5 && y_offset == 0.5
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
   mova                 m1, [dstq]
   pavgb                m4, m3
   punpckhbw            m3, m1, m5
   pavgb                m0, m4
 %if %2 == 1 ; avg
   punpcklbw            m1, m5
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
   movx                 m2, [srcq]
   movx                 m3, [srcq+1]
 %if %2 == 1 ; avg
 %if %1 > 4
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
   movx                 m1, [srcq+src_strideq]
   punpckldq            m2, m1
   movx                 m1, [srcq+src_strideq+1]
   punpckldq            m3, m1
 %endif
   pavgb                m2, m3
 %if %1 > 4
   movlhps              m0, m2
   movhlps              m4, m2
 %else ; 4xh
   punpckldq            m0, m2
   pshuflw              m4, m2, 0xe
 %endif
   movx                 m1, [dstq]
   pavgb                m0, m2
   movx                 m3, [dstq+dst_strideq]
 %if %1 > 4
   pavgb                m0, [secq]
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %else ; !avg
   movx                 m4, [srcq+src_strideq]
   movx                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
   movx                 m1, [dstq]
   movx                 m3, [dstq+dst_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
   STORE_AND_RET %1

 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
 %ifdef PIC
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
   mova                m10, [pw_8]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else  ;x86_32
 %if ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0.5. We can reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
   mov tempq, g_pw_8m
 %define filter_rnd [tempq]
 %else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
 %endif

 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_other_loop:
   movu                 m4, [srcq]
   movu                 m2, [srcq+1]
   mova                 m1, [dstq]
   pavgb                m4, m2
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
   pmaddubsw            m2, filter_y_a
   pmaddubsw            m0, filter_y_a
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
   psraw                m2, 4
 %else
   punpckhbw            m2, m0, m5
   punpckhbw            m3, m4, m5
   pmullw               m2, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m2, filter_rnd
   punpcklbw            m0, m5
   paddw                m2, m3
   punpcklbw            m3, m4, m5
   pmullw               m0, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m0, filter_rnd
   psraw                m2, 4
   paddw                m0, m3
 %endif
   punpckhbw            m3, m1, m5
   psraw                m0, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 %if notcpuflag(ssse3)
   punpcklbw            m0, m5
 %endif
 .x_half_y_other_loop:
   movx                 m2, [srcq]
   movx                 m1, [srcq+1]
   movx                 m4, [srcq+src_strideq]
   movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
   movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
   pmaddubsw            m2, filter_y_a
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   paddw                m2, filter_rnd
 %else
   punpcklbw            m2, m5
   punpcklbw            m4, m5
   pmullw               m0, filter_y_a
   pmullw               m1, m2, filter_y_b
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   pmullw               m2, filter_y_a
   paddw                m0, m1
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
   movx                 m1, [dstq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
 %if %1 == 4
   movlhps              m0, m2
 %endif
   packuswb             m0, m2
 %if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_half_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
   STORE_AND_RET %1

 .x_nonhalf:
   test          y_offsetd, y_offsetd
   jnz .x_nonhalf_y_nonzero

   ; x_offset == bilin interpolation && y_offset == 0
 %ifdef PIC
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
   mova                m10, [pw_8]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
 %if ARCH_X86=1 && CONFIG_PIC=1
 ;y_offset == 0. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
   mov tempq, g_pw_8m
 %define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
 %endif

 .x_other_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
   mova                 m1, [dstq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m0, filter_x_a
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
 %else
   punpckhbw            m2, m0, m5
   punpckhbw            m3, m4, m5
   punpcklbw            m0, m5
   punpcklbw            m4, m5
   pmullw               m2, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m2, filter_rnd
   pmullw               m0, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m0, filter_rnd
   paddw                m2, m3
   paddw                m0, m4
 %endif
   psraw                m2, 4
   psraw                m0, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
   movx                 m2, [srcq+src_strideq]
   movx                 m4, [srcq+src_strideq+1]
   movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   movx                 m1, [dstq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   paddw                m2, filter_rnd
 %else
   punpcklbw            m0, m5
   punpcklbw            m1, m5
   punpcklbw            m2, m5
   punpcklbw            m4, m5
   pmullw               m0, filter_x_a
   pmullw               m1, filter_x_b
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   pmullw               m2, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
   movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
 %if %1 == 4
   movlhps              m0, m2
 %endif
   packuswb             m0, m2
 %if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_other_y_zero_loop
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
   STORE_AND_RET %1

 .x_nonhalf_y_nonzero:
   cmp           y_offsetd, 4
   jne .x_nonhalf_y_nonhalf

   ; x_offset == bilin interpolation && y_offset == 0.5
 %ifdef PIC
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
   mova                m10, [pw_8]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
 %if ARCH_X86=1 && CONFIG_PIC=1
 ; y_offset == 0.5. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
   mov tempq, g_pw_8m
 %define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
 %endif

 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m1
   punpcklbw            m0, m1
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m0, filter_x_a
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
 %else
   punpckhbw            m2, m0, m5
   punpckhbw            m3, m1, m5
   punpcklbw            m0, m5
   punpcklbw            m1, m5
   pmullw               m0, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m0, filter_rnd
   pmullw               m2, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m2, filter_rnd
   paddw                m0, m1
   paddw                m2, m3
 %endif
   psraw                m0, 4
   psraw                m2, 4
   add                srcq, src_strideq
   packuswb             m0, m2
 .x_other_y_half_loop:
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
 %if cpuflag(ssse3)
   mova                 m1, [dstq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
   psraw                m4, 4
   packuswb             m4, m2
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
 %else
   punpckhbw            m2, m4, m5
   punpckhbw            m1, m3, m5
   punpcklbw            m4, m5
   punpcklbw            m3, m5
   pmullw               m4, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   pmullw               m2, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m2, filter_rnd
   paddw                m4, m3
   paddw                m2, m1
   mova                 m1, [dstq]
   psraw                m4, 4
   psraw                m2, 4
   punpckhbw            m3, m1, m5
   ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
   ; have a 1-register shortage to be able to store the backup of the bilin
   ; filtered second line as words as cache for the next line. Packing into
   ; a byte costs 1 pack and 2 unpacks, but saves a register.
   packuswb             m4, m2
   punpcklbw            m1, m5
   pavgb                m0, m4
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   pavgb                m0, [secq]
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
   paddw                m0, filter_rnd
 %else
   punpcklbw            m0, m5
   punpcklbw            m1, m5
   pmullw               m0, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m0, filter_rnd
   paddw                m0, m1
 %endif
   add                srcq, src_strideq
   psraw                m0, 4
 .x_other_y_half_loop:
   movx                 m2, [srcq]
   movx                 m1, [srcq+1]
   movx                 m4, [srcq+src_strideq]
   movx                 m3, [srcq+src_strideq+1]
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
   movx                 m1, [dstq]
   movx                 m3, [dstq+dst_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
   punpcklbw            m2, m5
   punpcklbw            m1, m5
   punpcklbw            m4, m5
   punpcklbw            m3, m5
   pmullw               m2, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m2, filter_rnd
   pmullw               m4, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
   movx                 m1, [dstq]
   paddw                m4, m3
   movx                 m3, [dstq+dst_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
   pavgw                m0, m2
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
 %if %1 == 4
   movlhps              m0, m2
 %endif
   packuswb             m0, m2
 %if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_other_y_half_loop
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
   STORE_AND_RET %1

 .x_nonhalf_y_nonhalf:
 %ifdef PIC
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
   mova                m10, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                m11, [bilin_filter+y_offsetq+16]
 %endif
   mova                m12, [pw_8]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
 %else   ; x86-32
 %if ARCH_X86=1 && CONFIG_PIC=1
 ; In this case, there is NO unused register. Used src_stride register. Later,
 ; src_stride has to be loaded from stack when it is needed.
 %define tempq src_strideq
   mov tempq, g_bilin_filterm
   add           x_offsetq, tempq
   add           y_offsetq, tempq
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]

   mov tempq, g_pw_8m
 %define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
   add           y_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
 %endif

   ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m1
   punpcklbw            m0, m1
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m0, filter_x_a
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
 %else
   punpckhbw            m2, m0, m5
   punpckhbw            m3, m1, m5
   punpcklbw            m0, m5
   punpcklbw            m1, m5
   pmullw               m0, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m0, filter_rnd
   pmullw               m2, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m2, filter_rnd
   paddw                m0, m1
   paddw                m2, m3
 %endif
   psraw                m0, 4
   psraw                m2, 4

   INC_SRC_BY_SRC_STRIDE

   packuswb             m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
   mova                 m1, [dstq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
   punpckhbw            m3, m1, m5
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
   psraw                m4, 4
   packuswb             m4, m2
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
   pmaddubsw            m2, filter_y_a
   pmaddubsw            m0, filter_y_a
   punpcklbw            m1, m5
   paddw                m2, filter_rnd
   paddw                m0, filter_rnd
   psraw                m2, 4
   psraw                m0, 4
 %else
   movu                 m3, [srcq]
   movu                 m4, [srcq+1]
   punpckhbw            m1, m3, m5
   punpckhbw            m2, m4, m5
   punpcklbw            m3, m5
   punpcklbw            m4, m5
   pmullw               m3, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m3, filter_rnd
   pmullw               m1, filter_x_a
   pmullw               m2, filter_x_b
   paddw                m1, filter_rnd
   paddw                m3, m4
   paddw                m1, m2
   psraw                m3, 4
   psraw                m1, 4
   packuswb             m4, m3, m1
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
   pmullw               m2, filter_y_a
   pmullw               m1, filter_y_b
   paddw                m2, filter_rnd
   pmullw               m0, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m2, m1
   mova                 m1, [dstq]
   paddw                m0, filter_rnd
   psraw                m2, 4
   paddw                m0, m3
   punpckhbw            m3, m1, m5
   psraw                m0, 4
   punpcklbw            m1, m5
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
   paddw                m0, filter_rnd
 %else
   punpcklbw            m0, m5
   punpcklbw            m1, m5
   pmullw               m0, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m0, filter_rnd
   paddw                m0, m1
 %endif
   psraw                m0, 4
 %if cpuflag(ssse3)
   packuswb             m0, m0
 %endif

   INC_SRC_BY_SRC_STRIDE

 .x_other_y_other_loop:
   movx                 m2, [srcq]
   movx                 m1, [srcq+1]

   INC_SRC_BY_SRC_STRIDE
   movx                 m4, [srcq]
   movx                 m3, [srcq+1]

 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
   movx                 m3, [dstq+dst_strideq]
   movx                 m1, [dstq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
   psraw                m4, 4
   packuswb             m2, m2
   packuswb             m4, m4
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
   pmaddubsw            m2, filter_y_a
   punpcklbw            m3, m5
   paddw                m0, filter_rnd
   paddw                m2, filter_rnd
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m1, m5
 %else
   punpcklbw            m2, m5
   punpcklbw            m1, m5
   punpcklbw            m4, m5
   punpcklbw            m3, m5
   pmullw               m2, filter_x_a
   pmullw               m1, filter_x_b
   paddw                m2, filter_rnd
   pmullw               m4, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
   paddw                m4, m3
   psraw                m2, 4
   psraw                m4, 4
   pmullw               m0, filter_y_a
   pmullw               m3, m2, filter_y_b
   paddw                m0, filter_rnd
   pmullw               m2, filter_y_a
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
   movx                 m3, [dstq+dst_strideq]
   paddw                m2, m1
   movx                 m1, [dstq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
   punpcklbw            m1, m5
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
 %if %1 == 4
   movlhps              m0, m2
 %endif
   packuswb             m0, m2
 %if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
   movh                 m2, [secq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
 %endif
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4

   INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
   dec                   block_height
   jg .x_other_y_other_loop
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
 %undef movx
   STORE_AND_RET %1
 %endmacro

 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
 ; between the ssse3 and non-ssse3 version. It may make sense to merge their
 ; code in the sense that the ssse3 version would jump to the appropriate
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.

 INIT_XMM sse2
 SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16

 INIT_XMM ssse3
 SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16

 INIT_XMM sse2
 SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1

 INIT_XMM ssse3
 SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1