Merge "Eliminate num_8x8 and num_4x4 width/height lookups"
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6294af1..4ee4ad4 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -494,6 +494,14 @@
         make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 524e79f..538f1ed 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -475,7 +475,7 @@
     specialize qw/vp9_iwht4x4_1_add msa/;
 
     add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_16_add msa/;
+    specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
   }
 }
 
diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
new file mode 100644
index 0000000..9619e37
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m2
+  psubw           m3,        m1
+  psubw           m4,        m0,        m3
+  psraw           m4,        1
+  psubw           m5,        m4,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m5
+  paddw           m3,        m4
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  SWAP            4, 0
+  SWAP            5, 2
+  punpcklwd       m4,        m1
+  pshufd          m0,        m4, 0x0e
+  punpcklwd       m5,        m3
+  pshufd          m2,        m5, 0x0e
+  SWAP            1, 4
+  SWAP            3, 0
+  punpckldq       m1,        m5
+  pshufd          m4,        m1, 0x0e
+  punpckldq       m3,        m2
+  pshufd          m0,        m3, 0x0e
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+
+; transposes a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+  mova            m3, m0
+  punpcklwd       m0, m1
+  punpckhwd       m3, m1
+  mova            m2, m0
+  punpcklwd       m0, m3
+  punpckhwd       m2, m3
+  pshufd          m1, m0, 0x0e
+  pshufd          m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movq            m%3,       [outputq]
+  movq            m%4,       [outputq + strideq]
+  punpcklbw       m%3,       m%5
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1
+  movd            [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 4, 7, input, output, stride
+  mova            m0,        [inputq +  0] ;a1
+  mova            m1,        [inputq + 16] ;c1
+
+  psraw           m0,        2
+  psraw           m1,        2
+
+  TRANSPOSE_4X4_WIDE
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+
+  pxor            m4, m4
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4
+  lea             outputq, [outputq + 2 * strideq]
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4
+
+  RET
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 02d986e..4f44b43 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -194,7 +194,6 @@
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-
   mbmi = &xd->mi[0]->mbmi;
 
   // Set up destination pointers.
@@ -2123,38 +2122,6 @@
   BLOCK_64X64
 };
 
-// Checks to see if a macro block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(VP9_COMP *cpi,
-                          int mi_row, int mi_col) {
-  int is_active_edge = 0;
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_rows;
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_cols;
-
-  // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    TWO_PASS *twopass = &cpi->twopass;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
-    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-    bottom_edge = MAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
-    is_active_edge = 1;
-  }
-
-  return is_active_edge;
-}
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
@@ -2253,7 +2220,7 @@
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
-  if (active_edge_sb(cpi, mi_row, mi_col)) {
+  if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
     min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3130941..659ce72 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -763,7 +763,7 @@
 }
 
 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                               TX_SIZE tx_size, void *arg) {
+                            TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a85b70b..61279f8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -126,6 +126,7 @@
             stats->pcnt_neutral,
             stats->intra_skip_pct,
             stats->inactive_zone_rows,
+            stats->inactive_zone_cols,
             stats->MVr,
             stats->mvr_abs,
             stats->MVc,
@@ -164,6 +165,7 @@
   section->pcnt_neutral = 0.0;
   section->intra_skip_pct = 0.0;
   section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
   section->MVr = 0.0;
   section->mvr_abs     = 0.0;
   section->MVc        = 0.0;
@@ -191,6 +193,7 @@
   section->pcnt_neutral += frame->pcnt_neutral;
   section->intra_skip_pct += frame->intra_skip_pct;
   section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
   section->MVr += frame->MVr;
   section->mvr_abs     += frame->mvr_abs;
   section->MVc        += frame->MVc;
@@ -216,6 +219,7 @@
   section->pcnt_neutral -= frame->pcnt_neutral;
   section->intra_skip_pct -= frame->intra_skip_pct;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
   section->MVr -= frame->MVr;
   section->mvr_abs     -= frame->mvr_abs;
   section->MVc        -= frame->MVc;
@@ -1050,6 +1054,7 @@
     fps.pcnt_neutral = (double)neutral_count / num_mbs;
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0047932..49f9da3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -53,6 +53,7 @@
   double pcnt_neutral;
   double intra_skip_pct;
   double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
   double MVr;
   double mvr_abs;
   double MVc;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a27e89..409300b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2898,6 +2898,47 @@
   *this_rd += (*this_rd * var_factor) / 100;
 }
 
+
+// Do we have an internal image edge (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+  return (cpi->oxcf.pass == 2) &&
+    ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+    (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a macro block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi,
+  int mi_row, int mi_col) {
+  int is_active_edge = 0;
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
+    ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
+    ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
+    ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
+    is_active_edge = 1;
+  }
+
+  return is_active_edge;
+}
+
 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
                                TileDataEnc *tile_data,
                                MACROBLOCK *x,
@@ -3751,13 +3792,15 @@
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3843,7 +3886,8 @@
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if (rd_less_than_thresh(best_rd,
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
                             tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 459b032..16a8c68 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -54,6 +54,9 @@
                                         PICK_MODE_CONTEXT *ctx,
                                         int64_t best_rd_so_far);
 
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
 void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                    struct TileDataEnc *tile_data,
                                    struct macroblock *x,
@@ -61,6 +64,7 @@
                                    struct RD_COST *rd_cost,
                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6c6c4ed..b3e5a0e 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -12,6 +12,8 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -90,7 +92,8 @@
   // If this is a two pass clip that fits the criteria for animated or
   // graphics content then reset disable_split_mask for speeds 1-4.
   if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
-      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) {
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp9_internal_image_edge(cpi)))) {
     sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
   }
 
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2e43c27..bcab558 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,6 +75,7 @@
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm