Merge "Code clean of highbd_tm_predictor_4x4"
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 66a23cb..2bebdcb 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -141,13 +141,13 @@
                                        &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -162,13 +162,13 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -194,14 +194,14 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -218,14 +218,14 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -251,14 +251,14 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -275,14 +275,14 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 04c4b73..a2a0674 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -288,13 +288,13 @@
   specialize qw/vpx_highbd_d153_predictor_4x4/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index 712ab49..233958a 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@
 pw_32: times 4 dd 32
 
 SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
   movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, one
-  mov                 oned, 0x0001
-  pxor                  m1, m1
-  movd                  m3, oned
-  pshufw                m3, m3, 0x0
   paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
+  pshuflw               m1, m0, 0xe
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1
+  paddw                 m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
   lea                 dstq, [dstq+strideq*4]
@@ -183,7 +179,7 @@
   REP_RET
 %endif
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   movq    [dstq          ], m0
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 22d52a2..30ee81b 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@
 
 %macro INC_SRC_BY_SRC_STRIDE  0
 %if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*2]
+  add                srcq, src_stridemp
+  add                srcq, src_stridemp
 %else
   lea                srcq, [srcq + src_strideq*2]
 %endif
 %endmacro
 
-%macro INC_SRC_BY_SRC_2STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*4]
-%else
-  lea                srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
@@ -984,8 +977,9 @@
 .x_other_y_other_loop:
   movu                 m2, [srcq]
   movu                 m4, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu                 m3, [srcq]
+  movu                 m5, [srcq+2]
   pmullw               m2, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m2, filter_rnd
@@ -1018,7 +1012,7 @@
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
-  INC_SRC_BY_SRC_2STRIDE
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq + dst_strideq * 4]
 %if %2 == 1 ; avg
   add                secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331c..81ec5db 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,13 +243,18 @@
 }
 
 #if CONFIG_USE_X86INC
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
 #define DECL(w, opt) \
   int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                  ptrdiff_t src_stride, \
                                                  int x_offset, int y_offset, \
                                                  const uint16_t *dst, \
                                                  ptrdiff_t dst_stride, \
-                                                 int height, unsigned int *sse);
+                                                 int height, \
+                                                 unsigned int *sse, \
+                                                 void *unused0, void *unused);
 #define DECLS(opt1, opt2) \
   DECL(8, opt1); \
   DECL(16, opt1)
@@ -274,7 +279,7 @@
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, h, \
-                                                       &sse); \
+                                                       &sse, NULL, NULL); \
   if (w > wf) { \
     unsigned int sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +287,20 @@
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
+          dst + 48, dst_stride, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -312,7 +318,7 @@
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, \
-                                                       h, &sse); \
+                                                       h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +326,21 @@
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 48, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -359,27 +366,27 @@
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, \
         x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
+        dst_stride, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
+          dst_stride, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 48 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       }\
@@ -410,6 +417,7 @@
 #undef FNS
 #undef FN
 
+// The 2 unused parameters are place holders for PIC enabled build.
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    ptrdiff_t src_stride, \
@@ -419,7 +427,8 @@
                                                    const uint16_t *sec, \
                                                    ptrdiff_t sec_stride, \
                                                    int height, \
-                                                   unsigned int *sse);
+                                                   unsigned int *sse, \
+                                                   void *unused0, void *unused);
 #define DECLS(opt1) \
 DECL(16, opt1) \
 DECL(8, opt1)
@@ -439,23 +448,23 @@
   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
+               y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -475,14 +484,15 @@
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src, src_stride, x_offset, \
                                             y_offset, dst, dst_stride, \
-                                            sec, w, h, &sse); \
+                                            sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 16, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 16, dst_stride, \
-                                            sec + 16, w, h, &sse2); \
+                                            sec + 16, w, h, &sse2, \
+                                            NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
@@ -490,14 +500,16 @@
                                             src + 32, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 32, dst_stride, \
-                                            sec + 32, w, h, &sse2); \
+                                            sec + 32, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 48, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 48, dst_stride, \
-                                            sec + 48, w, h, &sse2); \
+                                            sec + 48, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -525,7 +537,7 @@
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + (start_row * src_stride), src_stride, x_offset, \
                 y_offset, dst + (start_row * dst_stride), dst_stride, \
-                sec + (start_row * w), w, height, &sse2); \
+                sec + (start_row * w), w, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
@@ -533,7 +545,7 @@
                 src + 16 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 16 + (start_row * dst_stride), dst_stride, \
-                sec + 16 + (start_row * w), w, height, &sse2); \
+                sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
@@ -541,14 +553,14 @@
                 src + 32 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 32 + (start_row * dst_stride), dst_stride, \
-                sec + 32 + (start_row * w), w, height, &sse2); \
+                sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 48 + (start_row * dst_stride), dst_stride, \
-                sec + 48 + (start_row * w), w, height, &sse2); \
+                sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       } \
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index a141999..1ec906c 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -17,7 +17,7 @@
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %else ; avg
@@ -25,7 +25,7 @@
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -244,9 +244,9 @@
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
-  movd                  m6, [srcq+src_stride3q]
+  movd                  m3, [srcq+src_stride3q]
   punpckldq             m2, m5
-  punpckldq             m4, m6
+  punpckldq             m4, m3
   movlhps               m2, m4
   psadbw                m1, m2
   lea                 refq, [refq+ref_strideq*4]