Merge "Change size on first frame and change config cause crash."
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 9c30441..c73e787 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -201,6 +201,10 @@
   eval test "x\$$1" = "xno"
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) disabled, and enables the setting controlled by
+# the parameter when the setting is not disabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_enable() {
   for var in $*; do
     if ! disabled $var; then
@@ -210,6 +214,10 @@
   done
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) enabled, and disables the setting controlled by
+# the parameter when the setting is not enabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_disable() {
   for var in $*; do
     if ! enabled $var; then
@@ -625,6 +633,11 @@
     xcodebuild -sdk $1 -version Path 2>/dev/null
 }
 
+# Print the major version number of the Darwin SDK specified by $1.
+show_darwin_sdk_major_version() {
+  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -736,7 +749,15 @@
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
   case ${toolchain} in
-    *-darwin*)
+    arm*-darwin*)
+      add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${iphoneos_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iphoneos_sdk_dir}"
+        add_ldflags "-isysroot ${iphoneos_sdk_dir}"
+      fi
+      ;;
+    x86*-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
@@ -811,10 +832,36 @@
           if disabled neon && enabled neon_asm; then
             die "Disabling neon while keeping neon-asm is not supported"
           fi
-          soft_enable media
+          case ${toolchain} in
+            # Apple iOS SDKs no longer support armv6 as of the version 9
+            # release (coincides with release of Xcode 7). Only enable media
+            # when using earlier SDK releases.
+            *-darwin*)
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                soft_disable media
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media "
+              fi
+              ;;
+            *)
+              soft_enable media
+              ;;
+          esac
           ;;
         armv6)
-          soft_enable media
+          case ${toolchain} in
+            *-darwin*)
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                die "Your iOS SDK does not support armv6."
+              fi
+              ;;
+            *)
+              soft_enable media
+              ;;
+          esac
           ;;
       esac
 
diff --git a/test/test.mk b/test/test.mk
index bb5186b..fde97031 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -168,7 +168,7 @@
 TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
 
 ## VP10
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
 
 endif # CONFIG_SHARED
diff --git a/test/vp10_dct_test.cc b/test/vp10_dct_test.cc
index 8e49609..8fb5f4f 100644
--- a/test/vp10_dct_test.cc
+++ b/test/vp10_dct_test.cc
@@ -17,10 +17,7 @@
 #include "test/util.h"
 #include "./vpx_config.h"
 #include "vpx_ports/msvc.h"
-
-#undef CONFIG_COEFFICIENT_RANGE_CHECKING
-#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
-#include "vp10/encoder/dct.c"
+#include "vp10/encoder/dct.h"
 
 using libvpx_test::ACMRandom;
 
@@ -105,8 +102,8 @@
 INSTANTIATE_TEST_CASE_P(
     C, Vp10FwdTxfm,
     ::testing::Values(
-        FdctParam(&fdct4, &reference_dct_1d, 4, 1),
-        FdctParam(&fdct8, &reference_dct_1d, 8, 1),
-        FdctParam(&fdct16, &reference_dct_1d, 16, 2),
-        FdctParam(&fdct32, &reference_dct_1d, 32, 4)));
+        FdctParam(&vp10_fdct4, &reference_dct_1d, 4, 1),
+        FdctParam(&vp10_fdct8, &reference_dct_1d, 8, 1),
+        FdctParam(&vp10_fdct16, &reference_dct_1d, 16, 2),
+        FdctParam(&vp10_fdct32_local, &reference_dct_1d, 32, 4)));
 }  // namespace
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 80ace06..78f151e 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -34,7 +34,7 @@
 #endif
 }
 
-static void fdct4(const tran_low_t *input, tran_low_t *output) {
+void vp10_fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[4];
 
@@ -70,7 +70,7 @@
   range_check(output, 4, 13);
 }
 
-static void fdct8(const tran_low_t *input, tran_low_t *output) {
+void vp10_fdct8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[8];
 
@@ -148,7 +148,7 @@
   range_check(output, 8, 14);
 }
 
-static void fdct16(const tran_low_t *input, tran_low_t *output) {
+void vp10_fdct16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[16];
 
@@ -322,7 +322,8 @@
   range_check(output, 16, 16);
 }
 
-static void fdct32(const tran_low_t *input, tran_low_t *output) {
+// TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+void vp10_fdct32_local(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
 
@@ -995,24 +996,24 @@
 }
 
 static const transform_2d FHT_4[] = {
-  { fdct4,  fdct4  },  // DCT_DCT  = 0
-  { fadst4, fdct4  },  // ADST_DCT = 1
-  { fdct4,  fadst4 },  // DCT_ADST = 2
-  { fadst4, fadst4 }   // ADST_ADST = 3
+  { vp10_fdct4, vp10_fdct4 },  // DCT_DCT  = 0
+  { fadst4,     vp10_fdct4 },  // ADST_DCT = 1
+  { vp10_fdct4, fadst4     },  // DCT_ADST = 2
+  { fadst4,     fadst4     }   // ADST_ADST = 3
 };
 
 static const transform_2d FHT_8[] = {
-  { fdct8,  fdct8  },  // DCT_DCT  = 0
-  { fadst8, fdct8  },  // ADST_DCT = 1
-  { fdct8,  fadst8 },  // DCT_ADST = 2
-  { fadst8, fadst8 }   // ADST_ADST = 3
+  { vp10_fdct8, vp10_fdct8 },  // DCT_DCT  = 0
+  { fadst8,     vp10_fdct8 },  // ADST_DCT = 1
+  { vp10_fdct8, fadst8     },  // DCT_ADST = 2
+  { fadst8,     fadst8     }   // ADST_ADST = 3
 };
 
 static const transform_2d FHT_16[] = {
-  { fdct16,  fdct16  },  // DCT_DCT  = 0
-  { fadst16, fdct16  },  // ADST_DCT = 1
-  { fdct16,  fadst16 },  // DCT_ADST = 2
-  { fadst16, fadst16 }   // ADST_ADST = 3
+  { vp10_fdct16, vp10_fdct16 },  // DCT_DCT  = 0
+  { fadst16,     vp10_fdct16 },  // ADST_DCT = 1
+  { vp10_fdct16, fadst16     },  // DCT_ADST = 2
+  { fadst16,     fadst16     }   // ADST_ADST = 3
 };
 
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
@@ -1123,7 +1124,7 @@
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    vp10_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
     for (j = 0; j < 8; ++j)
       coeff_ptr[j + i * 8] /= 2;
   }
diff --git a/vp10/encoder/dct.h b/vp10/encoder/dct.h
new file mode 100644
index 0000000..ab0db93
--- /dev/null
+++ b/vp10/encoder/dct.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP10_ENCODER_DCT_H_
+#define VP10_ENCODER_DCT_H_
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4(const tran_low_t *input, tran_low_t *output);
+void vp10_fdct8(const tran_low_t *input, tran_low_t *output);
+void vp10_fdct16(const tran_low_t *input, tran_low_t *output);
+void vp10_fdct32_local(const tran_low_t *input, tran_low_t *output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_DCT_H_
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index ea527b1..20b7d50 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -653,6 +653,7 @@
     } else if (s[n]) {
       if (is_inter_block(mbmi)) {
         rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        r[n][1] -= r_tx_size;
       } else {
         rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
         rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
@@ -662,6 +663,11 @@
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    if (is_inter_block(mbmi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+      rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+      rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+    }
+
     // Early termination in transform size search.
     if (cpi->sf.tx_size_search_breakout &&
         (rd[n][1] == INT64_MAX ||
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index ead993a..7393a4e 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -23,6 +23,7 @@
 VP10_CX_SRCS-yes += encoder/context_tree.h
 VP10_CX_SRCS-yes += encoder/cost.h
 VP10_CX_SRCS-yes += encoder/cost.c
+VP10_CX_SRCS-yes += encoder/dct.h
 VP10_CX_SRCS-yes += encoder/dct.c
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.c
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.h
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c8ef367..5683736 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -176,7 +176,6 @@
   int mb_to_bottom_edge;
 
   FRAME_CONTEXT *fc;
-  int frame_parallel_decoding_mode;
 
   /* pointers to reference frames */
   RefBuffer *block_refs[2];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5d8eb90..61e731a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -364,7 +364,6 @@
       memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
     }
     xd->fc = cm->fc;
-    xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode;
   }
 
   xd->above_seg_context = cm->above_seg_context;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index e17b397..bb1e179 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -447,7 +447,7 @@
     cr->rate_boost_fac = 10;
   } else {
     cr->motion_thresh = 32;
-    cr->rate_boost_fac = 17;
+    cr->rate_boost_fac = 15;
   }
   if (cpi->svc.spatial_layer_id > 0) {
     cr->motion_thresh = 4;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 19b0beb..fc4d9ae 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1238,10 +1238,12 @@
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
-    i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-    if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
-      if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
-        ref_frame_skip_mask |= (1 << ref_frame);
+    if (!(this_mode == ZEROMV && ref_frame == LAST_FRAME)) {
+      i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+      if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
+        if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+          ref_frame_skip_mask |= (1 << ref_frame);
+    }
     if (ref_frame_skip_mask & (1 << ref_frame))
       continue;
 
@@ -1530,11 +1532,13 @@
       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
        bsize <= cpi->sf.max_intra_bsize)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
-    const TX_SIZE intra_tx_size =
-        VPXMIN(max_txsize_lookup[bsize],
-               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
+    TX_SIZE intra_tx_size =
+        VPXMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16)
+      intra_tx_size = TX_16X16;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3c84a77..0bffcba 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -664,6 +664,7 @@
     } else if (s[n]) {
       if (is_inter_block(mbmi)) {
         rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        r[n][1] -= r_tx_size;
       } else {
         rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
         rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
@@ -673,6 +674,11 @@
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    if (is_inter_block(mbmi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+      rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+      rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+    }
+
     // Early termination in transform size search.
     if (cpi->sf.tx_size_search_breakout &&
         (rd[n][1] == INT64_MAX ||
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 8d5c7c2..6fd5208 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -291,123 +291,6 @@
   }
 }
 
-#if ARCH_X86_64
-static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
-                                                 ptrdiff_t src_pitch,
-                                                 uint8_t *output_ptr,
-                                                 ptrdiff_t out_pitch,
-                                                 uint32_t output_height,
-                                                 const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 16 bytes
-  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 16 bytes
-    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-    // merge the result together
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-    // merge the result together
-    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-    src_ptr+=src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-    output_ptr+=out_pitch;
-  }
-}
-#endif  // ARCH_X86_64
-
 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 0d3223e..84ff365 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -11,6 +11,8 @@
 #ifndef VPX_PORTS_BITOPS_H_
 #define VPX_PORTS_BITOPS_H_
 
+#include <assert.h>
+
 #include "vpx_ports/msvc.h"
 
 #ifdef _MSC_VER
@@ -25,10 +27,15 @@
 extern "C" {
 #endif
 
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
 static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
@@ -36,6 +43,7 @@
 
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
+  assert(n != 0);
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
@@ -47,6 +55,8 @@
   unsigned int value = n;
   int i;
 
+  assert(n != 0);
+
   for (i = 4; i >= 0; --i) {
     const int shift = (1 << i);
     const unsigned int x = value >> shift;