Refactoring hybrid transform coding

The forward and inverse hybrid transforms are now each performed by a
single function that takes the transform dimension as an argument.
Added an inline function, clip8b, that clips the reconstruction pixels
to the range 0-255.

Change-Id: Id7d870b3e1aefc092721c80c0af6f641eb5f3747

diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 5336f5a..2a410c3 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h

@@ -111,9 +111,10 @@
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c);
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
 #include "vp8/common/blockd.h"
-void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+void vp8_ihtllm_c(short *input, short *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim);
 #endif
 
 

diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 616e493..5c7bf78 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c

@@ -93,120 +93,17 @@
 };
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
-  int i, j, k;
-  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in
-                            // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += 4;
-    ip  += 4;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = &iadst_4[0];
-      break;
-
-    default :
-      ptv = &idct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfb[i] = 0 ;
-      for(k = 0; k < 4; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<2)];
-      }
-      pfa += 1;
-    }
-
-    pfb += 4;
-    ptv += 4;
-    pfa = &bufa[0];
-  }
-
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = &iadst_4[0];
-      break;
-
-    default :
-      pth = &idct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfa[i] += pfb[k] * pth[k];
-      }
-      pth += 4;
-     }
-
-    pfa += 4;
-    pfb += 4;
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = &iadst_4[0];
-        break;
-
-      default :
-        pth = &idct_4[0];
-        break;
-    }
-  }
-
-  // convert to short integer format and load BLOCKD buffer
-  op  = output;
-  pfa = &bufa[0];
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-                             -(short)( - pfa[i] / 8 + 0.49);
-    }
-    op  += shortpitch;
-    pfa += 4;
-  }
-}
-#endif
-
-#if CONFIG_HYBRIDTRANSFORM8X8
-void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_ihtllm_c(short *input, short *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim) {
   int i, j, k;
   float bufa[64], bufb[64]; // buffers are for floating-point test purpose
                             // the implementation could be simplified in
                             // conjunction with integer transform
+
+                            // Note: because one function now serves both the
+                            // 4x4 and 8x8 transforms, the temporary buffers
+                            // are simply sized for the larger 8x8 case (64).
   short *ip = input;
   short *op = output;
   int shortpitch = pitch >> 1;
@@ -218,12 +115,12 @@
   float *ptv, *pth;
 
   // load and convert residual array into floating-point
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = (float)ip[i];
     }
-    pfa += 8;
-    ip  += 8;
+    pfa += tx_dim;
+    ip  += tx_dim;
   }
 
   // vertical transformation
@@ -233,25 +130,25 @@
   switch(tx_type) {
     case ADST_ADST :
     case ADST_DCT  :
-      ptv = &iadst_8[0];
+      ptv = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
       break;
 
     default :
-      ptv = &idct_8[0];
+      ptv = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfb[i] = 0 ;
-      for(k = 0; k < 8; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<3)];
+      for(k = 0; k < tx_dim; k++) {
+        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
       }
       pfa += 1;
     }
 
-    pfb += 8;
-    ptv += 8;
+    pfb += tx_dim;
+    ptv += tx_dim;
     pfa = &bufa[0];
   }
 
@@ -262,34 +159,34 @@
   switch(tx_type) {
     case ADST_ADST :
     case  DCT_ADST :
-      pth = &iadst_8[0];
+      pth = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
       break;
 
     default :
-      pth = &idct_8[0];
+      pth = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = 0;
-      for(k = 0; k < 8; k++) {
+      for(k = 0; k < tx_dim; k++) {
         pfa[i] += pfb[k] * pth[k];
       }
-      pth += 8;
+      pth += tx_dim;
      }
 
-    pfa += 8;
-    pfb += 8;
+    pfa += tx_dim;
+    pfb += tx_dim;
 
     switch(tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
-        pth = &iadst_8[0];
+        pth = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
         break;
 
       default :
-        pth = &idct_8[0];
+        pth = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
         break;
     }
   }
@@ -298,13 +195,14 @@
   op  = output;
   pfa = &bufa[0];
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
                              -(short)( - pfa[i] / 8 + 0.49);
     }
+
     op  += shortpitch;
-    pfa += 8;
+    pfa += tx_dim;
   }
 }
 #endif

diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index de9aad5..1357839 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c

@@ -33,7 +33,7 @@
 
 #if CONFIG_HYBRIDTRANSFORM
 void vp8_inverse_htransform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) {
-  vp8_iht4x4llm_c(b->dqcoeff, b->diff, pitch, b->bmi.as_mode.tx_type);
+  vp8_ihtllm_c(b->dqcoeff, b->diff, pitch, b->bmi.as_mode.tx_type, 4);
 }
 #endif
 

diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 057104f..59f453e 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c

@@ -392,7 +392,6 @@
       txfm_map(b, pred_mode_conv(i8x8mode));
       vp8_ht_dequant_idct_add_8x8_c(b->bmi.as_mode.tx_type,
                                     q, dq, pre, dst, 16, stride);
-      // vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
       q += 64;
 #else
       for (j = 0; j < 4; j++) {

diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index bf44fd6..6164c44 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c

@@ -55,7 +55,7 @@
     input[i] = dq[i] * input[i];
   }
 
-  vp8_iht4x4llm_c( input, output, 4 << 1, tx_type );
+  vp8_ihtllm_c(input, output, 4 << 1, tx_type, 4);
 
   vpx_memset(input, 0, 32);
 
@@ -95,7 +95,7 @@
     input[i] = dq[1] * input[i];
   }
 
-  vp8_iht8x8llm_c(input, output, 16, tx_type);
+  vp8_ihtllm_c(input, output, 16, tx_type, 8);
 
   vpx_memset(input, 0, 128);
 
@@ -117,9 +117,10 @@
       diff_ptr += 8;
       pred += pitch;
     }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+    // shift buffer pointers to next 4x4 block in the submacroblock
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
   }
 }
 #endif

diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index ad52585..ae19129 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c

@@ -329,114 +329,9 @@
 
 }
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
-  int i, j, k;
-  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                             // the implementation could be simplified in
-                             // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
-
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += 4;
-    ip  += pitch / 2;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = &adst_4[0];
-      break;
-
-    default :
-      ptv = &dct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfb[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<2)];
-      }
-      pfa += 1;
-    }
-    pfb += 4;
-    ptv += 4;
-    pfa = &bufa[0];
-  }
-
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = &adst_4[0];
-      break;
-
-    default :
-      pth = &dct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfa[i] += pfb[k] * pth[k];
-      }
-      pth += 4;
-     }
-
-    pfa += 4;
-    pfb += 4;
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = &adst_4[0];
-        break;
-
-      default :
-        pth = &dct_4[0];
-        break;
-    }
-  }
-
-  // convert to short integer format and load BLOCKD buffer
-  op  = output ;
-  pfa = &bufa[0] ;
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-                                   -(short)(- 8 * pfa[i] + 0.49);
-    }
-    op  += 4;
-    pfa += 4;
-  }
-}
-#endif
-
-#if CONFIG_HYBRIDTRANSFORM8X8
-void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_fht_c(short *input, short *output, int pitch,
+               TX_TYPE tx_type, int tx_dim) {
   int i, j, k;
   float bufa[64], bufb[64]; // buffers are for floating-point test purpose
                              // the implementation could be simplified in
@@ -451,11 +346,11 @@
   float *ptv, *pth;
 
   // load and convert residual array into floating-point
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = (float)ip[i];
     }
-    pfa += 8;
+    pfa += tx_dim;
     ip  += pitch / 2;
   }
 
@@ -466,24 +361,24 @@
   switch(tx_type) {
     case ADST_ADST :
     case ADST_DCT  :
-      ptv = &adst_8[0];
+      ptv = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
       break;
 
     default :
-      ptv = &dct_8[0];
+      ptv = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfb[i] = 0;
-      for(k = 0; k < 8; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<3)];
+      for(k = 0; k < tx_dim; k++) {
+        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
       }
       pfa += 1;
     }
-    pfb += 8;
-    ptv += 8;
+    pfb += tx_dim;
+    ptv += tx_dim;
     pfa = &bufa[0];
   }
 
@@ -494,34 +389,34 @@
   switch(tx_type) {
     case ADST_ADST :
     case  DCT_ADST :
-      pth = &adst_8[0];
+      pth = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
       break;
 
     default :
-      pth = &dct_8[0];
+      pth = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = 0;
-      for(k = 0; k < 8; k++) {
+      for(k = 0; k < tx_dim; k++) {
         pfa[i] += pfb[k] * pth[k];
       }
-      pth += 8;
+      pth += tx_dim;
      }
 
-    pfa += 8;
-    pfb += 8;
+    pfa += tx_dim;
+    pfb += tx_dim;
 
     switch(tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
-        pth = &adst_8[0];
+        pth = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
         break;
 
       default :
-        pth = &dct_8[0];
+        pth = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
         break;
     }
   }
@@ -530,13 +425,13 @@
   op  = output ;
   pfa = &bufa[0] ;
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
                                    -(short)(- 8 * pfa[i] + 0.49);
     }
-    op  += 8;
-    pfa += 8;
+    op  += tx_dim;
+    pfa += tx_dim;
   }
 }
 #endif
@@ -582,14 +477,6 @@
   }
 }
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht8x4_c(short *input, short *output, int pitch,
-                  TX_TYPE tx_type) {
-  vp8_fht4x4_c(input,     output,      pitch, tx_type);
-  vp8_fht4x4_c(input + 4, output + 16, pitch, tx_type);
-}
-#endif
-
 void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
 {
     vp8_short_fdct4x4_c(input,   output,    pitch);

diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index 9936969..2d7b617 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h

@@ -23,9 +23,9 @@
 #endif
 
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
-void vp8_fht8x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_fht_c(short *input, short *output, int pitch,
+               TX_TYPE tx_type, int tx_dim);
 #endif
 
 #if CONFIG_TX16X16

diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 964046d..c404901 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c

@@ -91,8 +91,7 @@
     if(active_ht) {
       b->bmi.as_mode.test = b->bmi.as_mode.first;
       txfm_map(b, b->bmi.as_mode.first);
-
-      vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+      vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4);
       vp8_ht_quantize_b(be, b);
       vp8_inverse_htransform_b(IF_RTCD(&rtcd->common->idct), b, 32) ;
     } else {
@@ -317,16 +316,11 @@
     vp8_subtract_4b_c(be, b, 16);
 
     txfm_map(b, pred_mode_conv(b->bmi.as_mode.first));
-
-    vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32,
-                 b->bmi.as_mode.tx_type);
+    vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
+              b->bmi.as_mode.tx_type, 8);
     x->quantize_b_8x8(x->block + idx, xd->block + idx);
-    vp8_iht8x8llm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                    b->bmi.as_mode.tx_type);
-
-//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-//    x->quantize_b_8x8(x->block + idx, xd->block + idx);
-//    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                 b->bmi.as_mode.tx_type, 8);
 
     // reconstruct submacroblock
     for (i = 0; i < 4; i++) {

diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3f2b8e8..67bf33d 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c

@@ -612,20 +612,20 @@
         if((type == PLANE_TYPE_Y_WITH_DC) && active_ht) {
           switch (b->bmi.as_mode.tx_type) {
             case ADST_DCT:
-              pt_scan = vp8_row_scan;
+              scan = vp8_row_scan;
               break;
 
             case DCT_ADST:
-              pt_scan = vp8_col_scan;
+              scan = vp8_col_scan;
               break;
 
             default:
-              pt_scan = vp8_default_zig_zag1d;
+              scan = vp8_default_zig_zag1d;
               break;
           }
 
         } else
-          pt_scan = vp8_default_zig_zag1d;
+          scan = vp8_default_zig_zag1d;
       }
 #endif
       break;
@@ -937,8 +937,7 @@
       if(active_ht) {
         b->bmi.as_mode.test = mode;
         txfm_map(b, mode);
-
-        vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+        vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4);
         vp8_ht_quantize_b(be, b);
       } else {
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
@@ -991,7 +990,7 @@
 
   // inverse transform
   if(active_ht) {
-    vp8_iht4x4llm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type );
+    vp8_ihtllm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type, 4);
   } else {
     IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
                                                                 b->diff, 32);
@@ -1230,8 +1229,8 @@
 
 #if CONFIG_HYBRIDTRANSFORM8X8
       txfm_map(b, pred_mode_conv(mode));
-      vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32, b->bmi.as_mode.tx_type);
-//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
+                b->bmi.as_mode.tx_type, 8);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
       // compute quantization mse of 8x8 block