Merge "vp8-denoiser: Avoid doing the mcomp if we don't denoise."

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d6cc5e4..c4dfaf3 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -792,6 +792,67 @@
   CompareInvReference(ref_txfm_, thresh_);
 }
 
+class PartialTrans16x16Test
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
+ public:
+  virtual ~PartialTrans16x16Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    bit_depth_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  FdctFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans16x16Test, Extremes) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  const int minval = -maxval;
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
+}
+
+TEST_P(PartialTrans16x16Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  int sum = 0;
+  for (int i = 0; i < kNumCoeffs; ++i) {
+    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+    input[i] = val;
+    sum += val;
+  }
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ(sum >> 1, output[0]);
+}
+
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -824,6 +885,11 @@
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    C, PartialTrans16x16Test,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
@@ -832,6 +898,9 @@
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
+                                                     VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -859,6 +928,9 @@
                    VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
                    VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -896,6 +968,9 @@
                    &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
         make_tuple(&idct16x16_12,
                    &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -912,5 +987,8 @@
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                    VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 87d8366..2a9942e 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -344,6 +344,28 @@
   EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
 }
 
+TEST_P(PartialTrans32x32Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  int sum = 0;
+  for (int i = 0; i < kNumCoeffs; ++i) {
+    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+    input[i] = val;
+    sum += val;
+  }
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+  EXPECT_EQ(sum >> 3, output[0]);
+}
+
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6f50f78..a9ca07c 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -74,6 +74,10 @@
   return res;
 }
 
+/* Note:
+ *  Our codebase calculates the "diff" value in the variance algorithm by
+ *  (src - ref).
+ */
 static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
                              int l2w, int l2h, int src_stride_coeff,
                              int ref_stride_coeff, uint32_t *sse_ptr,
@@ -87,14 +91,14 @@
     for (int x = 0; x < w; x++) {
       int diff;
       if (!use_high_bit_depth_) {
-        diff = ref[w * y * ref_stride_coeff + x] -
-               src[w * y * src_stride_coeff + x];
+        diff = src[w * y * src_stride_coeff + x] -
+               ref[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
-               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        diff = CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -309,15 +313,15 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-    if (!use_high_bit_depth_) {
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
+      if (!use_high_bit_depth_) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
-    } else {
-      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_;
-      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_;
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
+      }
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -328,8 +332,10 @@
                                            log2height_, stride_coeff,
                                            stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -346,8 +352,8 @@
         ref_[ref_ind] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_;
-        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_;
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
@@ -361,8 +367,10 @@
                                            log2height_, src_stride_coeff,
                                            ref_stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 596427c..f304de3 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -353,11 +353,36 @@
   }
 }
 
+// TODO(slavarnway): Move this decoder version of
+// vp9_get_pred_context_switchable_interp() to vp9_pred_common.h and update the
+// encoder.
+//
+// Returns a context number for the given MB prediction signal
+static int dec_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int left_type = left_mi ? left_mi->interp_filter : SWITCHABLE_FILTERS;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const int above_type = above_mi ? above_mi->interp_filter
+                             : SWITCHABLE_FILTERS;
+
+  if (left_type == above_type)
+    return left_type;
+  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+    return above_type;
+  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+    return left_type;
+  else
+    return SWITCHABLE_FILTERS;
+}
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP9_COMMON *const cm, MACROBLOCKD *const xd,
     vpx_reader *r) {
-  const int ctx = vp9_get_pred_context_switchable_interp(xd);
+  const int ctx = dec_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
       (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
                                    cm->fc->switchable_interp_prob[ctx]);
@@ -373,9 +398,6 @@
   const BLOCK_SIZE bsize = mi->sb_type;
   int i;
 
-  mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
-
   switch (bsize) {
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
@@ -399,6 +421,13 @@
   }
 
   mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode);
+
+  // Initialize interp_filter here so we do not have to check for inter block
+  // modes in dec_get_pred_context_switchable_interp()
+  mi->interp_filter = SWITCHABLE_FILTERS;
+
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {

diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index abe6768..10ea010 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c

@@ -91,7 +91,7 @@
   } else {
     if (ne->value > ne->thresh)
       noise_level = kMedium;
-    else if (ne->value > ((5 * ne->thresh) >> 3))
+    else if (ne->value > ((9 * ne->thresh) >> 4))
       noise_level = kLow;
     else
       noise_level = kLowLow;

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 9dd7394..545441b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -621,15 +621,16 @@
 
   if (cpi->common.frame_type == KEY_FRAME) {
     active_worst_quality = curr_frame == 0 ? rc->worst_quality
-                                           : rc->last_q[KEY_FRAME] * 2;
+                                           : rc->last_q[KEY_FRAME] << 1;
   } else {
     if (!rc->is_src_frame_alt_ref &&
         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      active_worst_quality =  curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
+      active_worst_quality =  curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 >> 2
                                               : rc->last_q[INTER_FRAME];
     } else {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
-                                             : rc->last_q[INTER_FRAME] * 2;
+      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] << 1 :
+          VPXMIN(rc->last_q[INTER_FRAME] << 1,
+                (rc->avg_frame_qindex[INTER_FRAME] * 3 >> 1));
     }
   }
   return VPXMIN(active_worst_quality, rc->worst_quality);
@@ -662,7 +663,7 @@
                    VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
                           rc->avg_frame_qindex[KEY_FRAME]) :
                    rc->avg_frame_qindex[INTER_FRAME];
-  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
+  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2);
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
@@ -2047,7 +2048,7 @@
     // for cases where there is small change from content that is completely
     // static.
     if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 30000;
+      min_thresh = 40000;
       thresh = 2.0f;
     }
     if (avg_sad >
@@ -2061,14 +2062,15 @@
     // For VBR, under scene change/high content change, force golden refresh.
     if (cpi->oxcf.rc_mode == VPX_VBR &&
         rc->high_source_sad &&
-        cpi->refresh_golden_frame == 0 &&
         cpi->ext_refresh_frame_flags_pending == 0) {
       int target;
       cpi->refresh_golden_frame = 1;
+      rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+      rc->baseline_gf_interval = VPXMIN(20,
+          VPXMAX(10, rc->baseline_gf_interval));
       rc->frames_till_gf_update_due = rc->baseline_gf_interval;
       if (rc->frames_till_gf_update_due > rc->frames_to_key)
         rc->frames_till_gf_update_due = rc->frames_to_key;
-      rc->gfu_boost = DEFAULT_GF_BOOST;
       target = calc_pframe_target_size_one_pass_vbr(cpi);
       vp9_rc_set_frame_target(cpi, target);
     }

diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index 39596b1..a5802e1 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c

@@ -365,12 +365,12 @@
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  tran_low_t sum = 0;
+  int sum = 0;
   for (r = 0; r < 16; ++r)
     for (c = 0; c < 16; ++c)
       sum += input[r * stride + c];
 
-  output[0] = sum >> 1;
+  output[0] = (tran_low_t)(sum >> 1);
 }
 
 static INLINE tran_high_t dct_32_round(tran_high_t input) {

diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c
index f66dd5f..0dd141f 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c

@@ -237,11 +237,9 @@
 }
 
 void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[1] = 0;
-
-  out[0] = LD_HADD(input, stride);
-  out[0] += LD_HADD(input + 8, stride);
-  out[0] += LD_HADD(input + 16 * 8, stride);
-  out[0] += LD_HADD(input + 16 * 8 + 8, stride);
-  out[0] >>= 1;
+  int sum = LD_HADD(input, stride);
+  sum += LD_HADD(input + 8, stride);
+  sum += LD_HADD(input + 16 * 8, stride);
+  sum += LD_HADD(input + 16 * 8 + 8, stride);
+  out[0] = (int16_t)(sum >> 1);
 }

diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index bca72e8..e4deeec 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c

@@ -40,7 +40,7 @@
 
   in1 = _mm_add_epi32(tmp, in0);
   in0 = _mm_slli_epi32(in1, 1);
-  store_output(&in0, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
 }
 
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
@@ -80,7 +80,7 @@
   in0 = _mm_srli_si128(sum, 8);
 
   in1 = _mm_add_epi32(sum, in0);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
@@ -91,40 +91,39 @@
   int i;
 
   for (i = 0; i < 2; ++i) {
-    input += 8 * i;
-    in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
 
     u0 = _mm_add_epi16(in0, in1);
     u1 = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
     u1  = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
     u1  = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
@@ -132,6 +131,7 @@
     sum = _mm_add_epi16(sum, u0);
 
     sum = _mm_add_epi16(sum, u1);
+    input += 8 * stride;
   }
 
   u0  = _mm_setzero_si128();
@@ -149,7 +149,7 @@
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 1);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
@@ -221,7 +221,7 @@
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 3);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 #define DCT_HIGH_BIT_DEPTH 0

diff --git a/webmdec.cc b/webmdec.cc
index f541cfe..c4539a0 100644
--- a/webmdec.cc
+++ b/webmdec.cc

@@ -164,7 +164,7 @@
       }
       get_new_block = true;
     }
-    if (status) {
+    if (status || block_entry == NULL) {
       return -1;
     }
     if (get_new_block) {