Merge "mips msa vp9 fdct 4x4 optimization"

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 352cde2..7b4c435 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc

@@ -264,8 +264,8 @@
 INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
                 vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
                 vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
-                vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                vp9_tm_predictor_8x8_neon)
+                vp9_h_predictor_8x8_neon, vp9_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
 
 #endif  // HAVE_NEON
 

diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 13c46a5..cfd5905 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c

@@ -338,6 +338,26 @@
   dst[3 * stride + 3] = above[7];
 }
 
+void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // 8x8 D45 intra predictor: writes 8 rows of 8 bytes to dst
+  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };  // lane i <- lane i+1, last lane repeated
+  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };  // lane i <- lane i+2, last lane repeated
+  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
+  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
+  const uint8x8_t A0 = vld1_u8(above);  // top row: above[0..7] (only these 8 bytes are read)
+  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);  // above shifted by one, above[7] padded
+  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);  // above shifted by two, above[7] padded
+  const uint8x8_t avg1 = vhadd_u8(A0, A2);  // truncating (A0 + A2) >> 1
+  uint8x8_t row = vrhadd_u8(avg1, A1);  // rounding average; equals (A0 + 2*A1 + A2 + 2) >> 2
+  int i;
+  (void)left;  // left column is not used by this predictor
+  for (i = 0; i < 7; ++i) {  // rows 0..6
+    vst1_u8(dst + i * stride, row);
+    row = vtbl1_u8(row, sh_12345677);  // next row: shift by one lane, replicating the last lane
+  }
+  vst1_u8(dst + i * stride, row);  // i == 7 here: store the final row without a further shuffle
+}
+
 // -----------------------------------------------------------------------------
 
 void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index a984c3e..63eb0cb 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -99,7 +99,7 @@
 specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_8x8 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index dcddefc..b405975 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -2885,8 +2885,7 @@
   if (xd->lossless)
     return ONLY_4X4;
   if (cpi->common.frame_type == KEY_FRAME &&
-      cpi->sf.use_nonrd_pick_mode &&
-      cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+      cpi->sf.use_nonrd_pick_mode)
     return ALLOW_16X16;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
@@ -3572,15 +3571,26 @@
         set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
         if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
             xd->mi[0]->mbmi.segment_id) {
-          x->max_partition_size = BLOCK_64X64;
+          // Use lower max_partition_size for low resolutions.
+          if (cm->width <= 352 && cm->height <= 288)
+            x->max_partition_size = BLOCK_32X32;
+          else
+            x->max_partition_size = BLOCK_64X64;
           x->min_partition_size = BLOCK_8X8;
           nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                                BLOCK_64X64, &dummy_rdc, 1,
                                INT64_MAX, td->pc_root);
         } else {
           choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
-          nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                                 BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+          // TODO(marpan): Seems like nonrd_select_partition does not support
+          // 4x4 partition. Since 4x4 is used on key frame, use this switch
+          // for now.
+          if (cm->frame_type == KEY_FRAME)
+            nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                                BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+          else
+            nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                                   BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         }
 
         break;