Correct scaling of sad in sad_skip functions for hbd encode

Bit-depth-based scaling is added to the sad_skip functions by
introducing wrapper functions that down-convert the SAD to the
8-bit domain.

Results on 10-bit encodes show a speed improvement for speed
levels 0 to 6:

          Instruction count          BD-Rate Impact(%)
cpu-used    Reduction(%)      avg.psnr   ovr.psnr   ssim
     0        2.47             -0.0107   -0.0022    -0.0154
     1        3.11             -0.0131   -0.0119    -0.0189
     2        2.90             -0.0023   -0.0076    -0.0073
     3        3.02              0.0358    0.0246     0.0658
     4        3.27              0.0425    0.0289     0.0693
     5        4.29              0.0555    0.0501     0.0958
     6        4.57              0.2623    0.283      0.3235

STATS_CHANGED for hbd encoding

Change-Id: Idd3906bf5699c6ed3baa5569960fb81f6e461c66
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index b8aea1d..2154fa9 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1323,10 +1323,7 @@
   SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d);
   SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
   SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d);
-  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
   SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d);
-  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d);
-  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d);
 #undef SDSFP
 
 #if CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 40e7c08..37b200f 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -328,15 +328,6 @@
               aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
               aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
 
-#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
-  cpi->fn_ptr[BT].sdsf = SDSF;          \
-  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
-
-#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT)            \
-  HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,               \
-               aom_highbd_sad_skip_##WIDTH##x##HEIGHT, \
-               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d)
-
 #define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
   static unsigned int fnname##_bits8(                                    \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
@@ -387,6 +378,92 @@
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
 #endif
 
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdsf = SDSF;          \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
+  HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,                          \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
+#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname)                                  \
+  static unsigned int fnname##_bits8(const uint8_t *src, int src_stride,    \
+                                     const uint8_t *ref, int ref_stride) {  \
+    return fnname(src, src_stride, ref, ref_stride);                        \
+  }                                                                         \
+  static unsigned int fnname##_bits10(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 2;                   \
+  }                                                                         \
+  static unsigned int fnname##_bits12(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 4;                   \
+  }
+
+#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname)                                 \
+  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
+                             const uint8_t *const ref_ptr[], int ref_stride,  \
+                             unsigned int *sad_array) {                       \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+  }                                                                           \
+  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
+  }                                                                           \
+  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif
+
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;           \
   cpi->fn_ptr[BT].ovf = OVF;             \
@@ -518,6 +595,26 @@
         LOWBD_OBFP_WRAPPER(8, 32)
         LOWBD_OBFP_WRAPPER(16, 4)
         LOWBD_OBFP_WRAPPER(4, 16)
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 8);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 8);
         break;
 
       case AOM_BITS_10:
@@ -589,6 +686,26 @@
         HIGHBD_OBFP_WRAPPER(8, 32, 10)
         HIGHBD_OBFP_WRAPPER(16, 4, 10)
         HIGHBD_OBFP_WRAPPER(4, 16, 10)
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 10);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 10);
         break;
 
       case AOM_BITS_12:
@@ -660,6 +777,26 @@
         HIGHBD_OBFP_WRAPPER(8, 32, 12)
         HIGHBD_OBFP_WRAPPER(16, 4, 12)
         HIGHBD_OBFP_WRAPPER(4, 16, 12)
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 12);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 12);
         break;
 
       default:
@@ -667,29 +804,6 @@
                "cm->seq_params.bit_depth should be AOM_BITS_8, "
                "AOM_BITS_10 or AOM_BITS_12");
     }
-
-    HIGHBD_SDSFP_WRAPPER(128, 128);
-    HIGHBD_SDSFP_WRAPPER(128, 64);
-    HIGHBD_SDSFP_WRAPPER(64, 128);
-    HIGHBD_SDSFP_WRAPPER(64, 64);
-    HIGHBD_SDSFP_WRAPPER(64, 32);
-    HIGHBD_SDSFP_WRAPPER(64, 16);
-    HIGHBD_SDSFP_WRAPPER(32, 64);
-    HIGHBD_SDSFP_WRAPPER(32, 32);
-    HIGHBD_SDSFP_WRAPPER(32, 16);
-    HIGHBD_SDSFP_WRAPPER(32, 8);
-    HIGHBD_SDSFP_WRAPPER(16, 64);
-    HIGHBD_SDSFP_WRAPPER(16, 32);
-    HIGHBD_SDSFP_WRAPPER(16, 16);
-    HIGHBD_SDSFP_WRAPPER(16, 8);
-    HIGHBD_SDSFP_WRAPPER(8, 16);
-    HIGHBD_SDSFP_WRAPPER(8, 8);
-    HIGHBD_SDSFP_WRAPPER(4, 16);
-    HIGHBD_SDSFP_WRAPPER(4, 8);
-    HIGHBD_SDSFP_WRAPPER(4, 16);
-    HIGHBD_SDSFP_WRAPPER(8, 32);
-    HIGHBD_SDSFP_WRAPPER(32, 8);
-    HIGHBD_SDSFP_WRAPPER(64, 16);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH