Merge "Add neon optimize vp9_short_idct4x4_1_add."
diff --git a/test/md5_helper.h b/test/md5_helper.h
index fc1a974..f34054d 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -25,9 +25,15 @@
 
   void Add(const vpx_image_t *img) {
     for (int plane = 0; plane < 3; ++plane) {
-      uint8_t *buf = img->planes[plane];
-      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
-      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
+                    img->y_chroma_shift : img->d_h;
+      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
+                    img->x_chroma_shift : img->d_w;
 
       for (int y = 0; y < h; ++y) {
         MD5Update(&md5_, buf, w);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 15b0bc6..e15c44d 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -735,7 +735,8 @@
   sf->mode_search_skip_flags = 0;
   sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
-  sf->last_chroma_intra_mode = TM_PRED;
+  sf->intra_y_mode_mask = ALL_INTRA_MODES;
+  sf->intra_uv_mode_mask = ALL_INTRA_MODES;
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
   sf->use_uv_intra_rd_estimate = 0;
@@ -765,7 +766,7 @@
       sf->static_segmentation = 0;
 #endif
       sf->use_avoid_tested_higherror = 1;
-      sf->adaptive_rd_thresh = 1;
+      sf->adaptive_rd_thresh = MIN((speed + 1), 4);
 
       if (speed == 1) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -798,6 +799,9 @@
         // the main framework of partition search type.
         sf->disable_split_var_thresh = 0;
         sf->disable_filter_search_var_thresh = 16;
+
+        sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
+        sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -819,7 +823,8 @@
                                      FLAG_SKIP_COMP_REFMISMATCH |
                                      FLAG_SKIP_INTRA_LOWVAR |
                                      FLAG_EARLY_TERMINATE;
-        sf->last_chroma_intra_mode = DC_PRED;
+        sf->intra_y_mode_mask = INTRA_DC_TM;
+        sf->intra_uv_mode_mask = INTRA_DC_TM;
         sf->use_uv_intra_rd_estimate = 1;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
@@ -859,6 +864,8 @@
         sf->subpel_iters_per_step = 1;
         sf->disable_split_var_thresh = 64;
         sf->disable_filter_search_var_thresh = 64;
+        sf->intra_y_mode_mask = INTRA_DC_ONLY;
+        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -1395,7 +1402,7 @@
 }
 
 VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
-  int i;
+  int i, j;
   volatile union {
     VP9_COMP *cpi;
     VP9_PTR   ptr;
@@ -1597,9 +1604,10 @@
 
   vp9_set_speed_features(cpi);
 
-  // Set starting values of RD threshold multipliers (128 = *1)
-  for (i = 0; i < MAX_MODES; i++)
-    cpi->rd_thresh_mult[i] = 128;
+  // Default rd threshold factors for mode selection
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    for (j = 0; j < MAX_MODES; ++j)
+      cpi->rd_thresh_freq_fact[i][j] = 32;
 
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
             SDX3F, SDX8F, SDX4DF)\
@@ -2629,8 +2637,6 @@
 
   // Set various flags etc to special state if it is a key frame
   if (cm->frame_type == KEY_FRAME) {
-    int i;
-
     // Reset the loop filter deltas and segmentation map
     setup_features(cm);
 
@@ -2643,10 +2649,6 @@
     // The alternate reference frame cannot be active for a key frame
     cpi->source_alt_ref_active = 0;
 
-    // Reset the RD threshold multipliers to default of * 1 (128)
-    for (i = 0; i < MAX_MODES; i++)
-      cpi->rd_thresh_mult[i] = 128;
-
     cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
     cm->frame_parallel_decoding_mode =
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1c0b2ca..8328374 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -238,6 +238,11 @@
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
+#define ALL_INTRA_MODES 0x3FF
+#define INTRA_DC_ONLY 0x01
+#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+
 typedef struct {
   int RD;
   SEARCH_METHODS search_method;
@@ -288,7 +293,8 @@
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
-  MB_PREDICTION_MODE last_chroma_intra_mode;
+  int intra_y_mode_mask;
+  int intra_uv_mode_mask;
   int use_rd_breakout;
   int use_uv_intra_rd_estimate;
   int use_fast_lpf_pick;
@@ -375,8 +381,6 @@
   int ref_frame_mask;
   int set_ref_frame_mask;
 
-  int rd_thresh_mult[MAX_MODES];
-  int rd_baseline_thresh[BLOCK_SIZES][MAX_MODES];
   int rd_threshes[BLOCK_SIZES][MAX_MODES];
   int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 567d3d0..ee21957 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -104,9 +104,8 @@
 static int rd_thresh_block_size_factor[BLOCK_SIZES] =
   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-#define BASE_RD_THRESH_FREQ_FACT 16
-#define MAX_RD_THRESH_FREQ_FACT 32
-#define MAX_RD_THRESH_FREQ_INC 1
+#define MAX_RD_THRESH_FACT 64
+#define RD_THRESH_INC 1
 
 static void fill_token_costs(vp9_coeff_cost *c,
                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -212,12 +211,6 @@
         } else {
           cpi->rd_threshes[bsize][i] = INT_MAX;
         }
-        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
-        if (cpi->sf.adaptive_rd_thresh)
-          cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
-        else
-          cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
     }
   } else {
@@ -236,12 +229,6 @@
         } else {
           cpi->rd_threshes[bsize][i] = INT_MAX;
         }
-        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
-        if (cpi->sf.adaptive_rd_thresh)
-          cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
-        else
-          cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
     }
   }
@@ -1043,6 +1030,10 @@
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
+
+    if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+      continue;
+
     // Only do the oblique modes if the best so far is
     // one of the neighboring directional modes
     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -1228,6 +1219,9 @@
     int64_t local_tx_cache[TX_MODES];
     const int mis = xd->mode_info_stride;
 
+    if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+      continue;
+
     if (cpi->common.frame_type == KEY_FRAME) {
       const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
       const MB_PREDICTION_MODE L = xd->left_available ?
@@ -1325,10 +1319,14 @@
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
-  MB_PREDICTION_MODE last_mode = bsize <= BLOCK_8X8 ?
-              TM_PRED : cpi->sf.last_chroma_intra_mode;
+  // int mode_mask = (bsize <= BLOCK_8X8)
+  //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
 
-  for (mode = DC_PRED; mode <= last_mode; mode++) {
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    // if (!(mode_mask & (1 << mode)))
+    if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+      continue;
+
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
@@ -3216,7 +3214,7 @@
 
     // Test best rd so far against threshold for trying this mode.
     if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
-                     cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
+                     cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
         cpi->rd_threshes[bsize][mode_index] == INT_MAX)
       continue;
 
@@ -3777,29 +3775,6 @@
           }
         }
       }
-#if 0
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-              cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7)
-              * cpi->rd_thresh_mult[mode_index];
-#endif
-    } else {
-      // If the mode did not help improve the best error case then
-      // raise the threshold for testing that mode next time around.
-#if 0
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7)
-              * cpi->rd_thresh_mult[mode_index];
-#endif
     }
 
     /* keep record of best compound/single-only prediction */
@@ -3942,33 +3917,19 @@
   if (cpi->sf.adaptive_rd_thresh) {
     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
       if (mode_index == best_mode_index) {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
+        cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+          (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
       } else {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
+        cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
-            (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+            (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
           cpi->rd_thresh_freq_fact[bsize][mode_index] =
-            cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+            cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
         }
       }
     }
   }
 
-  // TODO(rbultje) integrate with RD trd_thresh_freq_facthresholding
-#if 0
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
-      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
-  }
-#endif
-
   // macroblock modes
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index d3dbefe..3501cf1 100644
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -342,8 +342,8 @@
         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
 
         ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
+        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
         punpcklbw   mm1, mm6
         psubsw      mm0, mm1                    ; A-B (low order) to MM0
@@ -351,12 +351,12 @@
         pmaddwd     mm0, mm0                    ; square and accumulate
         add         rbx,rdx                     ; Inc pointer into ref data
         add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
         paddd       mm7, mm0                    ; accumulate in mm7
 
 
         ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
         punpcklbw   mm1, mm6
         psubsw      mm0, mm1                    ; A-B (low order) to MM0
@@ -365,11 +365,11 @@
         pmaddwd     mm0, mm0                    ; square and accumulate
         add         rbx,rdx                     ; Inc pointer into ref data
         add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
         paddd       mm7, mm0                    ; accumulate in mm7
 
         ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
         punpcklbw   mm1, mm6
         psubsw      mm0, mm1                    ; A-B (low order) to MM0
@@ -378,11 +378,11 @@
         pmaddwd     mm0, mm0                    ; square and accumulate
         add         rbx,rdx                     ; Inc pointer into ref data
         add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
         paddd       mm7, mm0                    ; accumulate in mm7
 
         ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
 
         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
         punpcklbw   mm1, mm6