Merge "SSE2 based h_predictor_32x32"
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d..ae5ba18 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -29,11 +29,14 @@
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
-         armv7-darwin-gcc
-         armv7s-darwin-gcc
-         x86-iphonesimulator-gcc
-         x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+             armv7-darwin-gcc
+             armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+             x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+             x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
 
 # Configures for the target specified by $1, and invokes make with the dist
 # target using $DIST_DIR as the distribution output directory.
@@ -197,15 +200,27 @@
   fi
 }
 
+print_list() {
+  local indent="$1"
+  shift
+  local list="$@"
+  for entry in ${list}; do
+    echo "${indent}${entry}"
+  done
+}
+
 iosbuild_usage() {
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
+    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+              and x86_64. Allows linking to framework when builds target MacOSX
+              instead of iOS.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
     --targets <targets>: Override default target list. Defaults:
-         ${TARGETS}
+$(print_list "        " ${TARGETS})
     --test-link: Confirms all targets can be linked. Functionally identical to
                  passing --enable-examples via --extra-configure-args.
     --verbose: Output information about the environment and each stage of the
@@ -249,6 +264,9 @@
       TARGETS="$2"
       shift
       ;;
+    --macosx)
+      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -273,10 +291,12 @@
   MAKEFLAGS=${MAKEFLAGS}
   ORIG_PWD=${ORIG_PWD}
   PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
-  TARGETS="${TARGETS}"
+  TARGETS="$(print_list "" ${TARGETS})"
+  OSX_TARGETS="${OSX_TARGETS}"
+  SIM_TARGETS="${SIM_TARGETS}"
 EOF
 fi
 
 build_framework "${TARGETS}"
 echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo "         ${TARGETS}"
+print_list "" ${TARGETS}
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..e7d3fa5 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,7 +119,7 @@
 %if ABI_IS_32BIT
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
                 extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
                 %define RESTORE_GOT pop %1
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %macro GET_GOT 1
                 push %1
                 call %%get_got
@@ -149,6 +149,8 @@
                 %undef RESTORE_GOT
                 %define RESTORE_GOT pop %1
             %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
         %endif
     %endif
 
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index b852a65..e9e3949 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -262,7 +262,7 @@
 }
 
 #if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 80f1778..70d012b 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1152,16 +1152,12 @@
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
   cm->dequant_bit_depth = cm->bit_depth;
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 32275d4..5f6d9d3 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1473,7 +1473,7 @@
     assert(n_log2_tiles > 0);
     vpx_wb_write_literal(&saved_wb, mag, 2);
     if (mag < 3)
-      data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
   } else {
     assert(n_log2_tiles == 0);
   }
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index bcdcff3..44ca276 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1155,7 +1155,7 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     x->source_variance =
         vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
-                                           bsize, xd->bd);
+                                            bsize, xd->bd);
   } else {
     x->source_variance =
       vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
@@ -2579,7 +2579,7 @@
 }
 
 static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
-  if (!cpi->common.seg.enabled && xd->lossless[0])
+  if (xd->lossless[0])
     return ONLY_4X4;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
@@ -2702,16 +2702,12 @@
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c85..015dbc0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -135,15 +135,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
@@ -182,15 +205,34 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
       modifier += rounding;
       modifier >>= strength;
 
@@ -383,55 +425,58 @@
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
-            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
-            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength,
+                                               filter_weight, accumulator + 256,
+                                               count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
+          }
+#else
+          // Apply the filter (YUV)
+          // TODO(jingning): Need SIMD optimization for this.
+          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                       predictor, 16, 16,
                                       strength, filter_weight,
                                       accumulator, count);
-            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 256,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 256,
                                       count + 256);
-            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 512,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 512,
                                       count + 512);
-          }
-#else
-          // Apply the filter (YUV)
-          vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, 16,
-                                    strength, filter_weight,
-                                    accumulator, count);
-          vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 256,
-                                    count + 256);
-          vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 512,
-                                    count + 512);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 5f9c963..22d52a2 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -123,7 +123,10 @@
       %define sec_str sec_stridemp
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -131,7 +134,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
@@ -140,7 +142,10 @@
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -148,7 +153,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 4aca3ec..f27b8d9 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -47,9 +47,9 @@
 
 INIT_XMM sse2
 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movd                  m0, [leftq]
   psadbw                m0, m1
@@ -143,9 +143,9 @@
 
 INIT_XMM sse2
 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movq                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3
@@ -239,14 +239,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -271,14 +268,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 1176a2f..c655e4b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,7 +139,10 @@
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -147,7 +150,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
@@ -156,7 +158,10 @@
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -164,7 +169,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
 %if ABI_IS_32BIT
   %if CONFIG_PIC=1
   %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
     %define WRT_PLT wrt ..plt
     %macro GET_GOT 1
       extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
       %define RESTORE_GOT pop %1
     %endmacro
   %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
     %macro GET_GOT 1
       push %1
       call %%get_got