Merge changes I2965e786,I144bedde

* changes:
  vpx_memset16: drop unnecessary local
  vpx_memset16: quiet signed/unsigned warning
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 0d14ad8..4404701 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -87,65 +87,127 @@
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp10_iht4x4_16_add/;
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add/;
 
-  add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp10_iht8x8_64_add/;
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add/;
 
-  add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  specialize qw/vp10_iht16x16_256_add/;
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add/;
 
-  add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct4x4 sse2/;
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4/;
 
-  add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct4x4_1 sse2/;
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1/;
 
-  add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct8x8 sse2/;
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8/;
 
-  add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct8x8_1 sse2/;
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1/;
 
-  add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct16x16 sse2/;
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16/;
 
-  add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct16x16_1 sse2/;
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1/;
 
-  add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct32x32 sse2/;
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32/;
 
-  add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct32x32_rd sse2/;
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd/;
 
-  add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fdct32x32_1 sse2/;
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1/;
 
-  add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct4x4 sse2/;
+    add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct4x4/;
 
-  add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct8x8 sse2/;
+    add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8/;
 
-  add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct8x8_1/;
+    add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8_1/;
 
-  add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct16x16 sse2/;
+    add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16/;
 
-  add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct16x16_1/;
+    add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16_1/;
 
-  add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct32x32 sse2/;
+    add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32/;
 
-  add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+    add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_rd/;
 
-  add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_highbd_fdct32x32_1/;
+    add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_1/;
+  } else {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add sse2/;
+
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add sse2/;
+
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add/;
+
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4 sse2/;
+
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1 sse2/;
+
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8 sse2/;
+
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1 sse2/;
+
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16 sse2/;
+
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1 sse2/;
+
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32 sse2/;
+
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd sse2/;
+
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct4x4 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8_1/;
+
+    add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16_1/;
+
+    add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_1/;
+  }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
   if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c
index d58e263..a2c674b 100644
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -12,14 +12,14 @@
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
-void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
-                            int tx_type) {
+void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
   __m128i in[2];
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = _mm_loadu_si128((const __m128i *)(input));
-  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -77,21 +77,21 @@
   }
 }
 
-void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
-                            int tx_type) {
+void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
   __m128i in[8];
   const __m128i zero = _mm_setzero_si128();
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8 * 1);
+  in[2] = load_input_data(input + 8 * 2);
+  in[3] = load_input_data(input + 8 * 3);
+  in[4] = load_input_data(input + 8 * 4);
+  in[5] = load_input_data(input + 8 * 5);
+  in[6] = load_input_data(input + 8 * 6);
+  in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -144,8 +144,8 @@
   RECON_AND_STORE(dest + 7 * stride, in[7]);
 }
 
-void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                int stride, int tx_type) {
   __m128i in0[16], in1[16];
 
   load_buffer_8x16(input, in0);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 968dad2..04fa8f6 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -395,6 +395,7 @@
   unsigned char *const seg_map = cpi->segmentation_map;
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
+  int consec_zero_mv_thresh = 0;
   memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
   sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
   sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
@@ -407,6 +408,9 @@
   assert(cr->sb_index < sbs_in_frame);
   i = cr->sb_index;
   cr->target_num_seg_blocks = 0;
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cr->percent_refresh > 0)
+    consec_zero_mv_thresh = 10 * (100 / cr->percent_refresh);
   do {
     int sum_map = 0;
     // Get the mi_row/mi_col corresponding to superblock index i.
@@ -416,9 +420,6 @@
     int mi_col = sb_col_index * MI_BLOCK_SIZE;
     int qindex_thresh =
         vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex);
-    int consec_zero_mv_thresh =
-        cpi->oxcf.content == VP9E_CONTENT_SCREEN ? 0
-        : 10 * (100 / cr->percent_refresh);
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;