Optimize and clean up the supertx predictor.

Use vpx_blend_a64_hmask and vpx_blend_a64_vmask to speed up
computing the supertx predictor.

Decoder speedup of up to 4% has been observed.

Change-Id: I255a5ba4cc24f78dc905d25b6e2f7fbafac13253
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 3a196a5..53fd1a6 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -1067,177 +1067,123 @@
   28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 };
 
-static void generate_1dmask(int length, uint8_t *mask, int plane) {
+static const uint8_t* get_supertx_mask(int length, int plane) {
   switch (length) {
     case 8:
-      memcpy(mask, plane ? mask_8_uv : mask_8, length);
-      break;
+      return plane ? mask_8_uv : mask_8;
     case 16:
-      memcpy(mask, plane ? mask_16_uv : mask_16, length);
-      break;
+      return plane ? mask_16_uv : mask_16;
     case 32:
-      memcpy(mask, plane ? mask_32_uv : mask_32, length);
-      break;
+      return plane ? mask_32_uv : mask_32;
     default:
       assert(0);
   }
+  return NULL;
 }
 
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col,
     int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane) {
-  int i, j;
   const struct macroblockd_plane *pd = &xd->plane[plane];
-  uint8_t mask[MAX_TX_SIZE];
-  int top_w = 4 << b_width_log2_lookup[top_bsize];
-  int top_h = 4 << b_height_log2_lookup[top_bsize];
-  int w = 4 << b_width_log2_lookup[bsize];
-  int h = 4 << b_height_log2_lookup[bsize];
-  int w_offset = (mi_col - mi_col_ori) * MI_SIZE;
-  int h_offset = (mi_row - mi_row_ori) * MI_SIZE;
+  const int ssx = pd->subsampling_x;
+  const int ssy = pd->subsampling_y;
+  const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+  const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+  const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+  const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+  int w_remain, h_remain;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
-  uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
-  int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   assert(bsize <= BLOCK_32X32);
-
-  top_w >>= pd->subsampling_x;
-  top_h >>= pd->subsampling_y;
-  w >>= pd->subsampling_x;
-  h >>= pd->subsampling_y;
-  w_offset >>= pd->subsampling_x;
-  h_offset >>= pd->subsampling_y;
+  assert(IMPLIES(plane == 0, ssx == 0));
+  assert(IMPLIES(plane == 0, ssy == 0));
 
   switch (partition) {
-    case PARTITION_HORZ:
-    {
+    case PARTITION_HORZ: {
+      const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+      w_remain = top_w;
+      h_remain = top_h - h_offset - h;
+      dst += h_offset * dst_stride;
+      pre += h_offset * pre_stride;
+
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
-        uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
-
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
+      if (is_hdb)
+        vpx_highbd_blend_a64_vmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, h, top_w, xd->bd);
+      else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst + h_offset * dst_stride;
-        uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
+        vpx_blend_a64_vmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, h, top_w);
 
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-
+      dst += h * dst_stride;
+      pre += h * pre_stride;
       break;
-    case PARTITION_VERT:
-    {
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16;
-        uint16_t *dst2_tmp = dst216;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          memcpy(dst_tmp + j, dst2_tmp + j,
-                     (top_w - w_offset - w) * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst;
-        uint8_t *dst2_tmp = dst2;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-            memcpy(dst_tmp + j, dst2_tmp + j,
-                       (top_w - w_offset - w) * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
+    case PARTITION_VERT: {
+      const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+      w_remain = top_w - w_offset - w;
+      h_remain = top_h;
+      dst += w_offset;
+      pre += w_offset;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (is_hdb)
+        vpx_highbd_blend_a64_hmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, top_h, w, xd->bd);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        vpx_blend_a64_hmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, top_h, w);
+
+      dst += w;
+      pre += w;
       break;
-    default:
+    }
+    default: {
       assert(0);
+      return;
+    }
   }
-  (void) xd;
+
+  if (w_remain == 0 || h_remain == 0) {
+    return;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (is_hdb) {
+    dst = (uint8_t*)CONVERT_TO_SHORTPTR(dst);
+    pre = (const uint8_t*)CONVERT_TO_SHORTPTR(pre);
+    dst_stride *= 2;
+    pre_stride *= 2;
+    w_remain *= 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  do {
+    memcpy(dst, pre, w_remain * sizeof(uint8_t));
+    dst += dst_stride;
+    pre += pre_stride;
+  } while (--h_remain);
 }
 
 void vp10_build_inter_predictors_sb_sub8x8_extend(
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 7321831..ac4a004 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -396,7 +396,8 @@
 struct macroblockd_plane;
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
     BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane);