Multiframe quality enhancement postprocessing

Adds a multiframe postprocessing module that enhances the quality of
frames coded at a lower quality than the preceding frames. The module
is enabled from the command line with the --mfqe option, and is most
beneficial for enhancing the quality of frames decoded using scalable
patterns.

Uses the vp8_variance_var16x16 and vp8_variance_sad16x16 function
pointers (and their 8x8 and 4x4 counterparts) to compute the variance
and SAD of blocks.

Change-Id: Id73d2a6e3572d07f9f8e36bbce00a4fc5ffd8961
diff --git a/examples/postproc.txt b/examples/postproc.txt
index 0940ea2..51b251a 100644
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -58,7 +58,7 @@
     if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn off postproc");
 } else if(frame_cnt%30 == 16) {
-    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0};
+    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};
 
     if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn on postproc");
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index ace4c11..0ef3009 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -12,9 +12,12 @@
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
+#include "common.h"
+#include "recon.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
+#include "../encoder/variance.h"
 
 #include <math.h>
 #include <stdlib.h>
@@ -121,7 +124,6 @@
     0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
-
 extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
 extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
 /***********************************************************************************************************
@@ -323,11 +325,11 @@
 }
 
 void vp8_deblock(YV12_BUFFER_CONFIG         *source,
-                        YV12_BUFFER_CONFIG         *post,
-                        int                         q,
-                        int                         low_var_thresh,
-                        int                         flag,
-                        vp8_postproc_rtcd_vtable_t *rtcd)
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp8_postproc_rtcd_vtable_t *rtcd)
 {
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
@@ -671,6 +673,210 @@
     }
 }
 
+static void multiframe_quality_enhance_block /* Blend or copy one source block (Y, U, V) into the matching destination block */
+(
+    int blksize, /* Luma block width/height: 16 or 8; any other value takes the 4x4 path. Chroma uses blksize / 2 */
+    int qcurr, /* Current-frame quantizer index from the caller; not read in this body yet */
+    int qprev, /* Previous quantizer index from the caller; not read in this body yet */
+    unsigned char *y, /* Source luma block */
+    unsigned char *u, /* Source U block */
+    unsigned char *v, /* Source V block */
+    int y_stride, /* Row stride of y */
+    int uv_stride, /* Row stride shared by u and v */
+    unsigned char *yd, /* Destination luma block: read as the blend reference, then overwritten */
+    unsigned char *ud, /* Destination U block */
+    unsigned char *vd, /* Destination V block */
+    int yd_stride, /* Row stride of yd */
+    int uvd_stride /* Row stride shared by ud and vd */
+)
+{
+    static const unsigned char VP8_ZEROS[16]= /* All-zero reference passed to the variance calls below */
+    {
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    };
+    int blksizeby2 = blksize >> 1; /* Chroma block width/height */
+    int blksizesq = blksize * blksize; /* Luma pixel count of the block */
+
+    int i, j;
+    unsigned char *yp;
+    unsigned char *ydp;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, sse, sad, thr; /* sse is only an output slot for the variance calls */
+    if (blksize == 16) /* Select the variance/SAD routines that match the block size */
+    {
+        act = vp8_variance_var16x16(y, y_stride, VP8_ZEROS, 0, &sse); /* Source vs. zeros; stride 0 presumably reuses the same 16 bytes per row */
+        sad = vp8_variance_sad16x16(y, y_stride, yd, yd_stride, 0); /* NOTE(review): confirm the trailing 0 is not an early-exit SAD limit */
+    }
+    else if (blksize == 8)
+    {
+        act = vp8_variance_var8x8(y, y_stride, VP8_ZEROS, 0, &sse);
+        sad = vp8_variance_sad8x8(y, y_stride, yd, yd_stride, 0); /* NOTE(review): same question about the trailing 0 */
+    }
+    else
+    {
+        act = vp8_variance_var4x4(y, y_stride, VP8_ZEROS, 0, &sse);
+        sad = vp8_variance_sad4x4(y, y_stride, yd, yd_stride, 0); /* NOTE(review): same question about the trailing 0 */
+    }
+
+    thr = 6 * blksizesq + (act >> 3); /* Base of 6 per luma pixel, raised by an eighth of act */
+    if (thr > 12 * blksizesq) thr = 12 * blksizesq; /* Capped at 12 per luma pixel */
+    /* These thresholds should be adapted later based on qcurr and qprev */
+    if (sad < thr) /* Source is close to the destination content: blend rather than replace */
+    {
+        enum { precision = 4 }; /* Fixed-point bits of the blend weights (weights sum to 16); enum keeps it a constant expression */
+        enum { roundoff = (1 << (precision - 1)) }; /* Rounding term added before the shift; a static initialized from a const object is not valid ISO C */
+        int ifactor = (sad << precision) / thr; /* Source weight; below 16 because sad < thr */
+        /* TODO: SIMD optimize this section */
+        if (ifactor) /* A zero source weight leaves the destination block untouched */
+        {
+            int icfactor = (1 << precision) - ifactor; /* Weight kept by the existing destination pixel */
+            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride) /* Luma rows */
+            {
+                for (j = 0; j < blksize; ++j)
+                    ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> precision); /* Rounded weighted average */
+            }
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride) /* U rows, same weights as luma */
+            {
+                for (j = 0; j < blksizeby2; ++j)
+                    udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> precision);
+            }
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride) /* V rows, same weights as luma */
+            {
+                for (j = 0; j < blksizeby2; ++j)
+                    vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> precision);
+            }
+        }
+    }
+    else /* sad >= thr: overwrite the destination block with the source block */
+    {
+        if (blksize == 16)
+        {
+            vp8_recon_copy16x16(y, y_stride, yd, yd_stride); /* presumably (src, src_stride, dst, dst_stride), matching the memcpy paths below */
+            vp8_recon_copy8x8(u, uv_stride, ud, uvd_stride);
+            vp8_recon_copy8x8(v, uv_stride, vd, uvd_stride);
+        }
+        else if (blksize == 8)
+        {
+            vp8_recon_copy8x8(y, y_stride, yd, yd_stride);
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride) /* 4x4 chroma copied row by row */
+                vpx_memcpy(udp, up, blksizeby2);
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, blksizeby2);
+        }
+        else
+        {
+            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride) /* Luma copied row by row */
+                vpx_memcpy(ydp, yp, blksize);
+            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride) /* Chroma copied row by row */
+                vpx_memcpy(udp, up, blksizeby2);
+            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, blksizeby2);
+        }
+    }
+}
+
+void vp8_multiframe_quality_enhance /* Walk every macroblock of the frame to show and enhance or copy it into the postproc buffer */
+(
+    VP8_COMMON *cm /* Reads frame_to_show, mi, frame_type, base_qindex, mb_rows, mb_cols; output goes to post_proc_buffer */
+)
+{
+    YV12_BUFFER_CONFIG *show = cm->frame_to_show; /* Source frame */
+    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer; /* Destination; its existing content is the blend reference */
+
+    FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at the base of the MB MODE_INFO list, which holds motion vectors etc. */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int qcurr = cm->base_qindex; /* Passed through to the block routine */
+    int qprev = cm->postproc_state.last_base_qindex; /* Passed through to the block routine */
+
+    int mb_row;
+    int mb_col;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr; /* Current macroblock in the source planes */
+    unsigned char *yd_ptr, *ud_ptr, *vd_ptr; /* Current macroblock in the destination planes */
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (((frame_type == INTER_FRAME && /* Inter frame whose MV row and col magnitudes are both <= 10 ... */
+                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
+                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
+                 (frame_type == KEY_FRAME)) && /* ... or any key frame ... */
+                mode_info_context->mbmi.mode != B_PRED) /* ... and not B_PRED: handle the macroblock as one 16x16 block */
+            {
+                multiframe_quality_enhance_block(16,
+                                                 qcurr,
+                                                 qprev,
+                                                 y_ptr,
+                                                 u_ptr,
+                                                 v_ptr,
+                                                 show->y_stride,
+                                                 show->uv_stride,
+                                                 yd_ptr,
+                                                 ud_ptr,
+                                                 vd_ptr,
+                                                 dest->y_stride,
+                                                 dest->uv_stride);
+            }
+            else if (mode_info_context->mbmi.mode == B_PRED) /* B_PRED is handled as four 8x8 quadrants; no motion test on this path */
+            {
+                int i, j; /* i: quadrant row, j: quadrant column */
+                for (i=0; i<2; ++i)
+                    for (j=0; j<2; ++j)
+                        multiframe_quality_enhance_block(8,
+                                                         qcurr,
+                                                         qprev,
+                                                         y_ptr + 8*(i*show->y_stride+j), /* 8 luma rows down per i, 8 pixels right per j */
+                                                         u_ptr + 4*(i*show->uv_stride+j), /* 4 chroma rows down per i, 4 pixels right per j */
+                                                         v_ptr + 4*(i*show->uv_stride+j),
+                                                         show->y_stride,
+                                                         show->uv_stride,
+                                                         yd_ptr + 8*(i*dest->y_stride+j),
+                                                         ud_ptr + 4*(i*dest->uv_stride+j),
+                                                         vd_ptr + 4*(i*dest->uv_stride+j),
+                                                         dest->y_stride,
+                                                         dest->uv_stride);
+            }
+            else /* Everything else (e.g. inter-frame macroblocks that failed the motion test): plain copy */
+            {
+                vp8_recon_copy16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_recon_copy8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_recon_copy8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16; /* Advance one macroblock to the right: 16 luma, 8 chroma pixels */
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+
+        y_ptr += show->y_stride  * 16 - 16 * cm->mb_cols; /* Undo the columns walked and move down one macroblock row */
+        u_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride  * 16 - 16 * cm->mb_cols; /* Same step in the destination planes */
+        ud_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
@@ -699,8 +905,8 @@
         dest->y_width = oci->Width;
         dest->y_height = oci->Height;
         dest->uv_height = dest->y_height / 2;
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
         return 0;
-
     }
 
 #if ARCH_X86||ARCH_X86_64
@@ -717,6 +923,12 @@
         vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
                     q, 1, 0, RTCD_VTABLE(oci));
     }
+    else if ((flags & VP8D_MFQE) &&
+             oci->current_video_frame >= 2 &&
+             oci->base_qindex - oci->postproc_state.last_base_qindex >= 10)
+    {
+        vp8_multiframe_quality_enhance(oci);
+    }
     else
     {
         vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
@@ -1105,5 +1317,6 @@
     dest->y_width = oci->Width;
     dest->y_height = oci->Height;
     dest->uv_height = dest->y_height / 2;
+    oci->postproc_state.last_base_qindex = oci->base_qindex;
     return 0;
 }
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index c641b9c..d5aaf62 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -104,6 +104,7 @@
     int           last_q;
     int           last_noise;
     char          noise[3072];
+    int           last_base_qindex;
     DECLARE_ALIGNED(16, char, blackclamp[16]);
     DECLARE_ALIGNED(16, char, whiteclamp[16]);
     DECLARE_ALIGNED(16, char, bothclamp[16]);
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
index 65b0cab..665e21f 100644
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -23,7 +23,8 @@
     VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
     VP8D_DEBUG_DRAW_MV          = 1<<7,
     VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
-    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+    VP8D_MFQE                   = 1<<10
 };
 
 typedef struct
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 54bdb85..43ea9a1 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -412,7 +412,7 @@
                 && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
             {
                 ctx->postproc_cfg.post_proc_flag =
-                    VP8_DEBLOCK | VP8_DEMACROBLOCK;
+                    VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
                 ctx->postproc_cfg.deblocking_level = 4;
                 ctx->postproc_cfg.noise_level = 0;
             }
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 983cc4a..eec9797 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -63,6 +63,7 @@
     VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
     VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
     VP8_DEBUG_TXT_RATE_INFO     = 1<<6, /**< print video rate info (encoder only) */
+    VP8_MFQE                    = 1<<10,
 };
 
 /*!\brief post process flags
diff --git a/vpxdec.c b/vpxdec.c
index 7401101..4482f3d 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -124,11 +124,13 @@
                                        "Display only selected block modes");
 static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
                                        "Draw only selected motion vectors");
+static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0,
+                                       "Enable multiframe quality enhancement");
 
 static const arg_def_t *vp8_pp_args[] =
 {
     &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
-    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
+    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, &mfqe,
     NULL
 };
 #endif
@@ -803,6 +805,11 @@
             postproc = 1;
             vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK;
         }
+        else if (arg_match(&arg, &mfqe, argi))
+        {
+            postproc = 1;
+            vp8_pp_cfg.post_proc_flag |= VP8_MFQE;
+        }
         else if (arg_match(&arg, &pp_debug_info, argi))
         {
             unsigned int level = arg_parse_uint(&arg);