Alternate reference frame

This commit redesigns the alternate reference frame generation
process. It employs a non-local means approach to produce a more
stable pixel estimate for the alternate reference frame. It improves
compression performance by:
derf   0.5%
hevcmr 0.8%
stdhd  1.3%
hevchr 1.0%

The encoding time at speed 0 is not affected.

Change-Id: Iaa757f0da189ce93812d69617a81bf630d449848
diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index 5278d3b..d5f896a 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@@ -135,15 +135,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
@@ -416,16 +439,16 @@
           }
 #else
           // Apply the filter (YUV)
-          vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+          vp10_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
                                     strength, filter_weight,
                                     accumulator, count);
-          vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 256,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 256,
                                     count + 256);
-          vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 512,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,