Merge "Moving count_mb_ref_frame_usage to macroblock struct"
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index ba70242..95be467 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -61,26 +61,26 @@
     s/:SHR:/ >> /g;
 
     # Convert ELSE to .else
-    s/ELSE/.else/g;
+    s/\bELSE\b/.else/g;
 
     # Convert ENDIF to .endif
-    s/ENDIF/.endif/g;
+    s/\bENDIF\b/.endif/g;
 
     # Convert ELSEIF to .elseif
-    s/ELSEIF/.elseif/g;
+    s/\bELSEIF\b/.elseif/g;
 
     # Convert LTORG to .ltorg
-    s/LTORG/.ltorg/g;
+    s/\bLTORG\b/.ltorg/g;
 
     # Convert endfunc to nothing.
-    s/endfunc//ig;
+    s/\bendfunc\b//ig;
 
     # Convert FUNCTION to nothing.
-    s/FUNCTION//g;
-    s/function//g;
+    s/\bFUNCTION\b//g;
+    s/\bfunction\b//g;
 
-    s/ENTRY//g;
-    s/MSARMASM/0/g;
+    s/\bENTRY\b//g;
+    s/\bMSARMASM\b/0/g;
     s/^\s+end\s+$//g;
 
     # Convert IF :DEF:to .if
@@ -149,11 +149,15 @@
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
     # ALIGN directive
-    s/ALIGN/.balign/g;
+    s/\bALIGN\b/.balign/g;
 
     # ARM code
     s/\sARM/.arm/g;
 
+    # push/pop
+    s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
+    s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
+
     # NEON code
     s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
     s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
@@ -189,7 +193,7 @@
     s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
 
     # Begin macro definition
-    if (/MACRO/) {
+    if (/\bMACRO\b/) {
         $_ = <STDIN>;
         s/^/.macro/;
         s/\$//g;                # remove formal param reference
@@ -198,7 +202,7 @@
 
     # For macros, use \ to reference formal params
     s/\$/\\/g;                  # End macro definition
-    s/MEND/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
     print;
     print "$comment_sub$comment\n" if defined $comment;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 05bbabe..c99a01c 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -767,6 +767,7 @@
             ;;
         armv5te)
             soft_enable edsp
+            disable fast_unaligned
             ;;
         esac
 
diff --git a/libs.mk b/libs.mk
index 373c1cd..4115dd8 100644
--- a/libs.mk
+++ b/libs.mk
@@ -20,8 +20,16 @@
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk
 
+# If this is a universal (fat) binary, then all the subarchitectures have
+# already been built and our job is to stitch them together. The
+# BUILD_LIBVPX variable indicates whether we should be building
+# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
+# that we're stitching.
+$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
+
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
+CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))
 
 include $(SRC_PATH_BARE)/vpx_mem/vpx_mem.mk
 CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
@@ -29,17 +37,17 @@
 include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 
+include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
+CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+
 
 ifeq ($(CONFIG_VP8_ENCODER),yes)
   VP8_PREFIX=vp8/
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
   CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif
 
@@ -48,10 +56,8 @@
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
   CODEC_DOC_SECTIONS += vp8 vp8_decoder
 endif
 
@@ -83,29 +89,10 @@
 INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
 endif
 
-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_LIBVPX variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
-
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emms.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
-endif
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -344,11 +331,6 @@
           $(RTCD_OPTIONS) $^ > $@
 CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h
 
-CODEC_DOC_SRCS += vpx/vpx_codec.h \
-                  vpx/vpx_decoder.h \
-                  vpx/vpx_encoder.h \
-                  vpx/vpx_image.h
-
 ##
 ## libvpx test directives
 ##
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index f2a2031..6fbcb64 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -59,9 +59,13 @@
     /* Test the buffer model here before subtracting the frame. Do so because
      * the way the leaky bucket model works in libvpx is to allow the buffer to
      * empty - and then stop showing frames until we've got enough bits to
-     * show one. */
-    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-        << pkt->data.frame.pts;
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
 
     const int frame_size_in_bits = pkt->data.frame.sz * 8;
 
@@ -125,7 +129,12 @@
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
 
-  for (int i = 70; i < 700; i += 200) {
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is same the issue as noted about (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i < 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 3610f025d..84afe7f 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -9,6 +9,7 @@
  */
 #include "test/decode_test_driver.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
 #include "test/video_source.h"
 
 namespace libvpx_test {
@@ -21,8 +22,9 @@
     ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
   }
 
-  const vpx_codec_err_t res_dec = vpx_codec_decode(&decoder_,
-                                                   cxdata, size, NULL, 0);
+  vpx_codec_err_t res_dec;
+  REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
+                                                  cxdata, size, NULL, 0));
   ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
 }
 
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index ebb3959..56339ca 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -12,6 +12,7 @@
 #if CONFIG_VP8_DECODER
 #include "test/decode_test_driver.h"
 #endif
+#include "test/register_state_check.h"
 #include "test/video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -58,9 +59,10 @@
   }
 
   // Encode the frame
-  res = vpx_codec_encode(&encoder_,
-                         video.img(), video.pts(), video.duration(),
-                         frame_flags, deadline_);
+  REGISTER_STATE_CHECK(
+      res = vpx_codec_encode(&encoder_,
+                             video.img(), video.pts(), video.duration(),
+                             frame_flags, deadline_));
   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }
 
diff --git a/test/idctllm_test.cc b/test/idctllm_test.cc
index dd42e22..1be5fa0 100644
--- a/test/idctllm_test.cc
+++ b/test/idctllm_test.cc
@@ -13,6 +13,7 @@
 #include "vpx_config.h"
 #include "vpx_rtcd.h"
 }
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
@@ -54,7 +55,7 @@
 {
     int i;
 
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -68,7 +69,7 @@
     int i;
 
     input[0] = 4;
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -85,7 +86,7 @@
         predict[i] = i;
 
     input[0] = 4;
-    UUT(input, predict, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) < 4 && i<64)
@@ -101,7 +102,7 @@
     for(i=0; i<16; i++)
         input[i] = i;
 
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
     for(i=0; i<256; i++)
         if((i&0xF) > 3 || i>63)
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index d2e0d61..4c16c3f 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -11,6 +11,7 @@
 
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@@ -246,8 +247,10 @@
 
   virtual void Predict(MB_PREDICTION_MODE mode) {
     mb_.mode_info_context->mbmi.mode = mode;
-    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
-             data_ptr_[0], kStride);
+    REGISTER_STATE_CHECK(pred_fn_(&mb_,
+                                  data_ptr_[0] - kStride,
+                                  data_ptr_[0] - 1, kStride,
+                                  data_ptr_[0], kStride));
   }
 
   intra_pred_y_fn_t pred_fn_;
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index af2f3bd..9227449 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@@ -74,8 +75,8 @@
   // Initialize pixels in the output to 99.
   (void)vpx_memset(dst_image, 99, output_size);
 
-  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
-             output_stride, block_width, flimits, 16);
+  REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+                                  output_stride, block_width, flimits, 16));
 
   static const uint8_t expected_data[block_height] = {
     4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
diff --git a/test/register_state_check.h b/test/register_state_check.h
new file mode 100644
index 0000000..fb3f53b
--- /dev/null
+++ b/test/register_state_check.h
@@ -0,0 +1,95 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+
+#ifdef _WIN64
+
+#define _WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winnt.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+inline bool operator==(const M128A& lhs, const M128A& rhs) {
+  return (lhs.Low == rhs.Low && lhs.High == rhs.High);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+namespace libvpx_test {
+
+// Compares the state of xmm[6-15] at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// Windows x64.
+// Usage:
+// {
+//   RegisterStateCheck reg_check;
+//   FunctionToVerify();
+// }
+class RegisterStateCheck {
+ public:
+  RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+
+ private:
+  static bool StoreRegisters(CONTEXT* const context) {
+    const HANDLE this_thread = GetCurrentThread();
+    EXPECT_TRUE(this_thread != NULL);
+    context->ContextFlags = CONTEXT_FLOATING_POINT;
+    const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+    EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+    return context_saved;
+  }
+
+  // Compares the register state. Returns true if the states match.
+  bool Check() const {
+    if (!initialized_) return false;
+    CONTEXT post_context;
+    if (!StoreRegisters(&post_context)) return false;
+
+    const M128A* xmm_pre = &pre_context_.Xmm6;
+    const M128A* xmm_post = &post_context.Xmm6;
+    for (int i = 6; i <= 15; ++i) {
+      EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+      ++xmm_pre;
+      ++xmm_post;
+    }
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  bool initialized_;
+  CONTEXT pre_context_;
+};
+
+#define REGISTER_STATE_CHECK(statement) do { \
+  libvpx_test::RegisterStateCheck reg_check; \
+  statement;                               \
+} while (false)
+
+}  // namespace libvpx_test
+
+#else  // !_WIN64
+
+namespace libvpx_test {
+
+class RegisterStateCheck {};
+#define REGISTER_STATE_CHECK(statement) statement
+
+}  // namespace libvpx_test
+
+#endif  // _WIN64
+
+#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 2b562e6..5a0653b 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -21,6 +21,7 @@
 }
 
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -65,9 +66,11 @@
 
   sad_m_by_n_fn_t sad_fn_;
   virtual unsigned int SAD(unsigned int max_sad) {
-    return sad_fn_(source_data_, source_stride_,
-                   reference_data_, reference_stride_,
-                   max_sad);
+    unsigned int ret;
+    REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_,
+                                       reference_data_, reference_stride_,
+                                       max_sad));
+    return ret;
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 06f14a1..c9dcceb 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
@@ -136,8 +137,8 @@
 
   uint8_t *src = const_cast<uint8_t*>(test_data);
 
-  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
-                  2, 2, dst_, kDstStride);
+  REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                       2, 2, dst_, kDstStride));
 
   for (int i = 0; i < height_; ++i)
     for (int j = 0; j < width_; ++j)
@@ -162,8 +163,9 @@
                                 xoffset, yoffset, dst_c_, kDstStride);
 
       // Run test.
-      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
-                      xoffset, yoffset, dst_, kDstStride);
+      REGISTER_STATE_CHECK(
+          sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                          xoffset, yoffset, dst_, kDstStride));
 
       for (int i = 0; i < height_; ++i)
         for (int j = 0; j < width_; ++j)
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 99363de..60acf81 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -10,6 +10,7 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 extern "C" {
 #include "vpx_config.h"
 #include "vpx_rtcd.h"
@@ -77,7 +78,7 @@
       predictor += kDiffPredStride;
     }
 
-    GetParam()(&be, &bd, kDiffPredStride);
+    REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
 
     base_src = *be.base_src;
     src_diff = be.src_diff;
diff --git a/test/test.mk b/test/test.mk
index 7a11a27..982be5b 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,4 +1,5 @@
 LIBVPX_TEST_SRCS-yes += acm_random.h
+LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
 LIBVPX_TEST_SRCS-yes += util.h
diff --git a/vp8/common/x86/loopfilter_block_sse2.asm b/vp8/common/x86/loopfilter_block_sse2.asm
index 1c445ef..3d45c61 100644
--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ b/vp8/common/x86/loopfilter_block_sse2.asm
@@ -150,6 +150,7 @@
 
     push    rbp
     mov     rbp, rsp
+    SAVE_XMM 11
     push    r12
     push    r13
     mov     thresh, arg(4)
@@ -258,6 +259,7 @@
 %ifidn __OUTPUT_FORMAT__,x64
     pop    r13
     pop    r12
+    RESTORE_XMM
     pop    rbp
 %endif
 
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index fe77450..1434bcd 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -890,6 +890,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
     push        rsi
     push        rdi
     GET_GOT     rbx
@@ -957,6 +958,7 @@
     RESTORE_GOT
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 13bcaf6..c06f245 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -352,6 +352,7 @@
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index e52a707..88c06be 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -29,6 +29,13 @@
 #include "error_concealment.h"
 #endif
 
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do {                      \
+  CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n)));  \
+  memset((p), 0, (n) * sizeof(*(p)));                              \
+} while (0)
+
+
 extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
@@ -668,11 +675,10 @@
         pbi->b_multithreaded_rd = 1;
         pbi->decoding_thread_count = core_count - 1;
 
-        CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
-        vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
-        CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
+        CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+        CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+        CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+        CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
 
         for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
         {
@@ -796,32 +802,32 @@
         uv_width = width >>1;
 
         /* Allocate an int for each mb row. */
-        CHECK_MEM_ERROR(pbi->mt_current_mb_col, vpx_malloc(sizeof(int) * pc->mb_rows));
+        CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
 
         /* Allocate memory for above_row buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
 
-        CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
-        CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
         /* Allocate memory for left_col buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
 
-        CHECK_MEM_ERROR(pbi->mt_uleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
 
-        CHECK_MEM_ERROR(pbi->mt_vleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
     }
 }
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 6af1ef8..1a234ca 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -33,7 +33,7 @@
 #endif
 #include "encodeframe.h"
 
-extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
 extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                                      int prob_intra,
                                      int prob_last,
@@ -1389,7 +1389,7 @@
         }
         else
         {
-            vp8_stuff_mb(cpi, xd, t);
+            vp8_stuff_mb(cpi, x, t);
         }
     }
 
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 77c1c5a..d2b7872 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1109,7 +1109,9 @@
     }
     else
     {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
             rate_correction_factor = cpi->gf_rate_correction_factor;
         else
             rate_correction_factor = cpi->rate_correction_factor;
@@ -1186,7 +1188,9 @@
         cpi->key_frame_rate_correction_factor = rate_correction_factor;
     else
     {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
             cpi->gf_rate_correction_factor = rate_correction_factor;
         else
             cpi->rate_correction_factor = rate_correction_factor;
@@ -1209,11 +1213,13 @@
         {
             Q = cpi->oxcf.key_q;
         }
-        else if (cpi->common.refresh_alt_ref_frame)
+        else if (cpi->oxcf.number_of_layers == 1 &&
+            cpi->common.refresh_alt_ref_frame)
         {
             Q = cpi->oxcf.alt_q;
         }
-        else if (cpi->common.refresh_golden_frame)
+        else if (cpi->oxcf.number_of_layers == 1  &&
+            cpi->common.refresh_golden_frame)
         {
             Q = cpi->oxcf.gold_q;
         }
@@ -1232,7 +1238,9 @@
             correction_factor = cpi->key_frame_rate_correction_factor;
         else
         {
-            if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            if (cpi->oxcf.number_of_layers == 1 &&
+               (cpi->common.refresh_alt_ref_frame ||
+                cpi->common.refresh_golden_frame))
                 correction_factor = cpi->gf_rate_correction_factor;
             else
                 correction_factor = cpi->rate_correction_factor;
@@ -1281,7 +1289,10 @@
 
             if (cpi->common.frame_type == KEY_FRAME)
                 zbin_oqmax = 0;
-            else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+            else if (cpi->oxcf.number_of_layers == 1 &&
+                (cpi->common.refresh_alt_ref_frame ||
+                (cpi->common.refresh_golden_frame &&
+                 !cpi->source_alt_ref_active)))
                 zbin_oqmax = 16;
             else
                 zbin_oqmax = ZBIN_OQ_MAX;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 5976297..0ae2f10 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -22,16 +22,9 @@
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
 endif
 
-VP8_CX_SRCS-yes += vp8_cx_iface.c
+VP8_CX_SRCS-yes += vp8cx.mk
 
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
+VP8_CX_SRCS-yes += vp8_cx_iface.c
 
 VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index b16615d..b030ee5 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -9,7 +9,7 @@
 ##
 
 
-#VP8_CX_SRCS list is modified according to different platforms.
+VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk
 
 #File list for arm
 # encoder
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index 2cfd280..dd39190 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -18,6 +18,8 @@
 VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
 VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
 
+VP8_DX_SRCS-yes += vp8dx.mk
+
 VP8_DX_SRCS-yes += vp8_dx_iface.c
 
 # common
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index d92e165..243b7a5 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -49,15 +49,22 @@
 #ifndef DEPRECATED
 #if defined(__GNUC__) && __GNUC__
 #define DEPRECATED          __attribute__ ((deprecated))
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #elif defined(_MSC_VER)
 #define DEPRECATED
-#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */
 #else
 #define DEPRECATED
+#endif
+#endif  /* DEPRECATED */
+
+#ifndef DECLSPEC_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#elif defined(_MSC_VER)
+#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */
+#else
 #define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #endif
-#endif
+#endif  /* DECLSPEC_DEPRECATED */
 
     /*!\brief Decorator indicating a function is potentially unused */
 #ifdef UNUSED
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 427fd0f..ffa123f 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -11,6 +11,21 @@
 
 API_EXPORTS += exports
 
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+
+API_DOC_SRCS-yes += vpx_codec.h
+API_DOC_SRCS-yes += vpx_decoder.h
+API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-yes += vpx_image.h
+
 API_SRCS-yes                += src/vpx_decoder.c
 API_SRCS-yes                += vpx_decoder.h
 API_SRCS-yes                += src/vpx_encoder.c
@@ -23,3 +38,4 @@
 API_SRCS-yes                += vpx_codec_impl_bottom.h
 API_SRCS-yes                += vpx_codec_impl_top.h
 API_SRCS-yes                += vpx_image.h
+API_SRCS-$(BUILD_LIBVPX)    += vpx_integer.h
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index 8ff95a1..74d8a1f 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -125,7 +125,11 @@
 }
 
 #elif defined(__ANDROID__) /* end _MSC_VER */
+#if defined(CHROMIUM_BUILD)
+#include <machine/cpu-features.h>
+#else
 #include <cpu-features.h>
+#endif
 
 int arm_cpu_caps(void)
 {
diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
new file mode 100644
index 0000000..e6cb52f
--- /dev/null
+++ b/vpx_ports/vpx_ports.mk
@@ -0,0 +1,26 @@
+##
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+PORTS_SRCS-yes += vpx_ports.mk
+
+PORTS_SRCS-$(BUILD_LIBVPX) += asm_offsets.h
+PORTS_SRCS-$(BUILD_LIBVPX) += mem.h
+PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h
+
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+PORTS_SRCS-$(BUILD_LIBVPX) += emms.asm
+PORTS_SRCS-$(BUILD_LIBVPX) += x86.h
+PORTS_SRCS-$(BUILD_LIBVPX) += x86_abi_support.asm
+PORTS_SRCS-$(BUILD_LIBVPX) += x86_cpuid.c
+endif
+
+PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
+PORTS_SRCS-$(ARCH_ARM) += arm.h
diff --git a/vpxenc.c b/vpxenc.c
index 7449e6c..c9547ea 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -2284,8 +2284,9 @@
             break;
         case VPX_CODEC_STATS_PKT:
             stream->frames_out++;
-            fprintf(stderr, " %6luS",
-                   (unsigned long)pkt->data.twopass_stats.sz);
+            if (!global->quiet)
+                fprintf(stderr, " %6luS",
+                       (unsigned long)pkt->data.twopass_stats.sz);
             stats_write(&stream->stats,
                         pkt->data.twopass_stats.buf,
                         pkt->data.twopass_stats.sz);