ARBITRARY_WEDGE: save raw mask instead of smoothed

Also:
- For RLE, also compute the rate as bits * 512, i.e. in av1_cost_literal()
  units (see the sketch below).
- Bitstream writing is partial: the RLE is computed for arbitrary wedges,
  but actually writing its output to the bitstream is still pending.
- Use av1_cost_literal() to convert the RLE bit count into a rate.
- Fix the build when DUMP_SEGMENT_MASKS is enabled.
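
For reference, a minimal standalone sketch (not part of the patch) of the
rate bookkeeping behind the new 'out_rate' output. The sketch_* names are
hypothetical; the real code uses av1_cost_literal() from av1/encoder/cost.h,
which scales a bit count by 512 (1 << AV1_PROB_COST_SHIFT). The bit widths
mirror the assumptions in av1_run_length_encode(): 1 bit per symbol (exactly
two segments) and log2(width * height) bits per run length, with only 64x64
and 128x128 blocks expected on this path.

  #include <assert.h>

  #define SKETCH_PROB_COST_SHIFT 9  // matches AV1_PROB_COST_SHIFT

  static int sketch_cost_literal(int num_bits) {
    return num_bits * (1 << SKETCH_PROB_COST_SHIFT);  // bits * 512
  }

  static int sketch_rle_rate(int num_runs, int width, int height) {
    // The run-length coder assumes 64x64 or 128x128 blocks only.
    assert((width == 64 && height == 64) || (width == 128 && height == 128));
    const int bits_per_symbol = 1;                           // 2 segments
    const int bits_per_run_len = (width == 128) ? 14 : 12;   // log2(w * h)
    return sketch_cost_literal(num_runs * (bits_per_symbol + bits_per_run_len));
  }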

Change-Id: I46efdf9f22f781c4eb5e43bd2dfaebdab018cc80
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 7428fc2..5d6af0b 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -206,6 +206,9 @@
 // sent together in functions related to interinter compound modes
 typedef struct {
   uint8_t *seg_mask;
+#if CONFIG_ARBITRARY_WEDGE
+  uint8_t *seg_mask_smoothed;
+#endif  // CONFIG_ARBITRARY_WEDGE
   int8_t wedge_index;
   int8_t wedge_sign;
   DIFFWTD_MASK_TYPE mask_type;
@@ -831,6 +834,13 @@
    */
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
 
+#if CONFIG_ARBITRARY_WEDGE
+  // Only used for arbitrary wedges. Derived from 'seg_mask' by extending the
+  // binary mask range and then smoothing to get a contiguous soft mask.
+  // TODO(urvang): Does size need to be 2 times?
+  DECLARE_ALIGNED(16, uint8_t, seg_mask_smoothed[2 * MAX_SB_SQUARE]);
+#endif  // CONFIG_ARBITRARY_WEDGE
+
   /*!
    * CFL (chroma from luma) related parameters.
    */
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 93b737d..5a00c63 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -26,6 +26,10 @@
 #include "av1/common/obmc.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
+// TODO(urvang): Move common parts to av1/common.
+#if CONFIG_ARBITRARY_WEDGE
+#include "av1/encoder/segment_patch.h"
+#endif  // CONFIG_ARBITRARY_WEDGE
 
 // This function will determine whether or not to create a warped
 // prediction.
@@ -330,8 +334,23 @@
     case COMPOUND_WEDGE:
 #if CONFIG_ARBITRARY_WEDGE
       if (av1_wedge_params_lookup[sb_type].codebook == NULL) {
-        // We are using an arbitrary mask, stored earlier.
-        return comp_data->seg_mask;
+        // We are using an arbitrary mask, stored earlier. We still need to
+        // derive the smooth mask from the raw binary mask.
+        const int bw = block_size_wide[sb_type];
+        const int bh = block_size_high[sb_type];
+        const int N = bw * bh;
+
+        memcpy(comp_data->seg_mask_smoothed, comp_data->seg_mask,
+               N * sizeof(*comp_data->seg_mask_smoothed));
+
+        // TODO(urvang): Refactor part below.
+
+        // Convert binary mask with values {0, 1} to one with values {0, 64}.
+        av1_extend_binary_mask_range(comp_data->seg_mask_smoothed, bw, bh);
+
+        // Get a smooth mask from the binary mask.
+        av1_apply_box_blur(comp_data->seg_mask_smoothed, bw, bh);
+        return comp_data->seg_mask_smoothed;
       }
 #endif  // CONFIG_ARBITRARY_WEDGE
       return av1_get_contiguous_soft_mask(comp_data->wedge_index,
@@ -951,6 +970,9 @@
       }
       // Assign physical buffer.
       inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+#if CONFIG_ARBITRARY_WEDGE
+      inter_pred_params.mask_comp.seg_mask_smoothed = xd->seg_mask_smoothed;
+#endif  // CONFIG_ARBITRARY_WEDGE
     }
 
     av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index f988e81..7f292bc 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -43,6 +43,9 @@
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/palette.h"
+#if CONFIG_ARBITRARY_WEDGE
+#include "av1/encoder/segment_patch.h"
+#endif  // CONFIG_ARBITRARY_WEDGE
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 
@@ -1235,9 +1238,28 @@
 
         if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
           assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-          aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
-                           ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
-          aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+          // TODO(urvang): Mirror this on the decoder side.
+#if CONFIG_ARBITRARY_WEDGE
+          if (av1_wedge_params_lookup[bsize].codebook == NULL) {
+            // Arbitrary wedge: run RLE and write the RLE output.
+            uint8_t rle_buf[3 * MAX_SB_SQUARE];
+            int rle_size = 0;
+            int rle_rate = 0;
+            av1_run_length_encode(mbmi->interinter_comp.seg_mask,
+                                  block_size_wide[bsize],
+                                  block_size_high[bsize],
+                                  block_size_wide[bsize], rle_buf, &rle_size,
+                                  &rle_rate);
+            // TODO(urvang): Write rle_buf.
+          } else {
+            // Transmit wedge index and sign only.
+#endif  // CONFIG_ARBITRARY_WEDGE
+            aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+                             ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+            aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+#if CONFIG_ARBITRARY_WEDGE
+          }
+#endif  // CONFIG_ARBITRARY_WEDGE
         } else {
           assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
           aom_write_literal(w, mbmi->interinter_comp.mask_type,
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 76eb4e1..470d377 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -306,11 +306,10 @@
 #if CONFIG_ARBITRARY_WEDGE
 // Create an arbitrary binary mask using spacial segmentation of this block.
 // This is used for larger blocks, where we don't have pre-defined wedges.
-static int64_t pick_arbitrary_wedge(const AV1_COMP *const cpi,
-                                    MACROBLOCK *const x, const BLOCK_SIZE bsize,
-                                    const int16_t *const residual1,
-                                    const int16_t *const diff10,
-                                    uint8_t *seg_mask, uint64_t *best_sse) {
+static int64_t pick_arbitrary_wedge(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint8_t *seg_mask, uint8_t *seg_mask_smoothed, uint64_t *best_sse) {
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int N = bw * bh;
@@ -327,38 +326,42 @@
   params.k = 5000;  // TODO(urvang): Temporary hack to get 2 components.
   int num_components = -1;
   av1_get_segments(x->plane[0].src.buf, bw, bh, x->plane[0].src.stride, &params,
-                   seg_mask, &num_components);
+                   seg_mask_smoothed, &num_components);
 
   if (num_components >= 2) {
     // TODO(urvang): Convert more than 2 components to 2 components.
     if (num_components == 2) {
+      // Save the raw mask to 'seg_mask', as that is the one to be used for
+      // signaling in the bitstream.
+      memcpy(seg_mask, seg_mask_smoothed, N * sizeof(*seg_mask));
+
+      // TODO(urvang): Refactor part below.
+
       // Convert binary mask with values {0, 1} to one with values {0, 64}.
-      av1_extend_binary_mask_range(seg_mask, bw, bh);
+      av1_extend_binary_mask_range(seg_mask_smoothed, bw, bh);
 #if DUMP_SEGMENT_MASKS
-      av1_dump_raw_y_plane(seg_mask, bw, bh, bw, "/tmp/2.binary_mask.yuv");
+      av1_dump_raw_y_plane(seg_mask_smoothed, bw, bh, bw, "/tmp/2.binary_mask.yuv");
 #endif  // DUMP_SEGMENT_MASKS
 
       // Get a smooth mask from the binary mask.
-      av1_apply_box_blur(seg_mask, bw, bh);
+      av1_apply_box_blur(seg_mask_smoothed, bw, bh);
 #if DUMP_SEGMENT_MASKS
-      av1_dump_raw_y_plane(seg_mask, bw, bh, bw, "/tmp/3.smooth_mask.yuv");
+      av1_dump_raw_y_plane(seg_mask_smoothed, bw, bh, bw, "/tmp/3.smooth_mask.yuv");
 #endif  // DUMP_SEGMENT_MASKS
 
       // Get RDCost
       *best_sse =
-          av1_wedge_sse_from_residuals(residual1, diff10, seg_mask, N);
+          av1_wedge_sse_from_residuals(residual1, diff10, seg_mask_smoothed, N);
       const MACROBLOCKD *const xd = &x->e_mbd;
       const int hbd = is_cur_buf_hbd(xd);
       const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-      *best_sse =  ROUND_POWER_OF_TWO(*best_sse, bd_round);
+      *best_sse = ROUND_POWER_OF_TWO(*best_sse, bd_round);
 
       int rate;
       int64_t dist;
-      model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, *best_sse, N,
-                                                    &rate, &dist);
-      // TODO(urvang): Add cost of signaling wedge itself to 'rate'.
+      model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, *best_sse,
+                                                    N, &rate, &dist);
       const int64_t rd = RDCOST(x->rdmult, rate, dist);
-      // TODO(urvang): Subtrack rate of signaling wedge (like pick_wedge)?
       return rd;
     }
     return INT64_MAX;
@@ -385,10 +388,11 @@
 
 #if CONFIG_ARBITRARY_WEDGE
   if (av1_wedge_params_lookup[bsize].codebook == NULL) {
-    // TODO(urvang): Reuse seg_mask or have a different wedge_mask array?
     mbmi->interinter_comp.seg_mask = xd->seg_mask;
-    rd = pick_arbitrary_wedge(cpi, x, bsize, residual1, diff10,
-                              mbmi->interinter_comp.seg_mask, best_sse);
+    mbmi->interinter_comp.seg_mask_smoothed = xd->seg_mask_smoothed;
+    rd = pick_arbitrary_wedge(
+        cpi, x, bsize, residual1, diff10, mbmi->interinter_comp.seg_mask,
+        mbmi->interinter_comp.seg_mask_smoothed, best_sse);
     mbmi->interinter_comp.wedge_sign = 0;
     mbmi->interinter_comp.wedge_index = -1;
     return rd;
@@ -1135,15 +1139,15 @@
       // 3*n, as storing each length takes 2 bytes.
       uint8_t rle_buf[3 * MAX_SB_SQUARE];
       int rle_size = 0;
+      int rle_rate = INT_MAX;
       av1_run_length_encode(mbmi->interinter_comp.seg_mask, bw, bh, bw, rle_buf,
-                            &rle_size);
-      return rle_size;
+                            &rle_size, &rle_rate);
+      return rle_rate;
     }
 #endif  // CONFIG_ARBITRARY_WEDGE
     return av1_cost_literal(1) +
-                     mode_costs
-                         ->wedge_idx_cost[mbmi->sb_type]
-                                         [mbmi->interinter_comp.wedge_index];
+           mode_costs->wedge_idx_cost[mbmi->sb_type]
+                                     [mbmi->interinter_comp.wedge_index];
   } else {
     assert(compound_type == COMPOUND_DIFFWTD);
     return av1_cost_literal(1);
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index dedc8b3..7d7f5e6 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -691,6 +691,9 @@
   int_mv tmp_mv[2];
   int tmp_rate_mv = 0;
   mbmi->interinter_comp.seg_mask = xd->seg_mask;
+#if CONFIG_ARBITRARY_WEDGE
+  mbmi->interinter_comp.seg_mask_smoothed = xd->seg_mask_smoothed;
+#endif  // CONFIG_ARBITRARY_WEDGE
   const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
 
   if (this_mode == NEW_NEWMV) {
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index dbe7e4b..6d75d87 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -368,6 +368,9 @@
   struct buf_2d *const dst_buf = &pd->dst;
   uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
   mbmi->interinter_comp.seg_mask = xd->seg_mask;
+#if CONFIG_ARBITRARY_WEDGE
+  mbmi->interinter_comp.seg_mask_smoothed = xd->seg_mask_smoothed;
+#endif  // CONFIG_ARBITRARY_WEDGE
   const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
   const int is_hbd = is_cur_buf_hbd(xd);
 
diff --git a/av1/encoder/segment_patch.cc b/av1/encoder/segment_patch.cc
index 7080d50..d71750d 100644
--- a/av1/encoder/segment_patch.cc
+++ b/av1/encoder/segment_patch.cc
@@ -1,8 +1,12 @@
 #include <assert.h>
 #include <unordered_map>
+#if DUMP_SEGMENT_MASKS
+#include <fstream>
+#endif  // DUMP_SEGMENT_MASKS
 
 #include "aom_mem/aom_mem.h"
 #include "av1/common/enums.h"
+#include "av1/encoder/cost.h"
 #include "av1/encoder/segment_patch.h"
 #include "third_party/segment/segment-image.h"
 
@@ -164,19 +168,29 @@
 }
 
 void av1_run_length_encode(const uint8_t *const img, int width, int height,
-                           int stride, uint8_t *out, int *out_size) {
+                           int stride, uint8_t *out, int *out_size,
+                           int *out_rate) {
   int out_idx = 0;
   uint8_t prev_val = img[0];
   int run_len = 1;
+  int num_bits = 0;
+  const int bits_per_symbol = 1;  // Assuming exactly 2 segments.
+  // Assuming 64x64 or 128x128 block sizes only.
+  assert((width == 64 && height == 64) || (width == 128 && height == 128));
+  const int bits_per_run_len =
+      (width == 128 && height == 128) ? 14 : 12;  // log2(width * height)
 
   for (int r = 0; r < height; ++r) {
     for (int c = (r == 0) ? 1 : 0; c < width; ++c) {
       const uint8_t curr_val = img[r * stride + c];
+      assert(curr_val < (1 << bits_per_symbol));
       if (curr_val == prev_val) {
         ++run_len;
       } else {
         out[out_idx++] = prev_val;
+        num_bits += bits_per_symbol;
         write_run_length(run_len, out, &out_idx);
+        num_bits += bits_per_run_len;
         run_len = 1;
         prev_val = curr_val;
       }
@@ -185,6 +199,7 @@
   out[out_idx++] = prev_val;
   write_run_length(run_len, out, &out_idx);
   *out_size = out_idx;
+  *out_rate = av1_cost_literal(num_bits + bits_per_symbol + bits_per_run_len);
 }
 
 #if DUMP_SEGMENT_MASKS
diff --git a/av1/encoder/segment_patch.h b/av1/encoder/segment_patch.h
index 41fde1a..567ee4a 100644
--- a/av1/encoder/segment_patch.h
+++ b/av1/encoder/segment_patch.h
@@ -56,9 +56,12 @@
 // - stride: image stride
 // Outputs:
 // - out: run-length encoded image. Assumed to be already allocated.
-// - out_size: length of 'out'
+// - out_size: length of 'out' in bytes.
+// - out_rate: cost of transmitting 'out', in units of bits * 512.
+// TODO(urvang): Write out bits directly and consolidate the last two outputs.
 void av1_run_length_encode(const uint8_t *const img, int width, int height,
-                           int stride, uint8_t *out, int *out_size);
+                           int stride, uint8_t *out, int *out_size,
+                           int *out_rate);
 
 #define DUMP_SEGMENT_MASKS 0
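
For completeness, a minimal usage sketch of the updated interface (not part
of the patch; rle_rate_for_mask() is a hypothetical helper). It mirrors the
call sites in bitstream.c and compound_type.c above: the worst-case output
buffer is 3 * MAX_SB_SQUARE bytes (a 1-byte value plus a 2-byte run length
per run), and 'out_rate' comes back in av1_cost_literal() units, i.e. bits
scaled by 512. MAX_SB_SQUARE is assumed to be visible via av1/common/enums.h,
as in segment_patch.cc above.

  #include <stdint.h>

  #include "av1/common/enums.h"           // MAX_SB_SQUARE
  #include "av1/encoder/segment_patch.h"  // av1_run_length_encode()

  // Hypothetical helper: returns the signaling cost of a binary 'seg_mask'
  // in av1_cost_literal() units (bits * 512).
  static int rle_rate_for_mask(const uint8_t *seg_mask, int bw, int bh) {
    uint8_t rle_buf[3 * MAX_SB_SQUARE];
    int rle_size = 0;  // encoded length of rle_buf, in bytes
    int rle_rate = 0;  // transmission cost, in bits * 512
    av1_run_length_encode(seg_mask, bw, bh, /*stride=*/bw, rle_buf, &rle_size,
                          &rle_rate);
    return rle_rate;
  }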