Merge "Fix rd_pick_partition search loop for 4x4 blocks"
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index f5f59b1..295443c 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -3,7 +3,7 @@
 no strict 'refs';
 use warnings;
 use Getopt::Long;
-Getopt::Long::Configure("auto_help");
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
 
 my %ALL_FUNCS = ();
 my @ALL_ARCHS;
diff --git a/configure b/configure
index 604ad73..d570081 100755
--- a/configure
+++ b/configure
@@ -25,6 +25,7 @@
   ${toggle_docs}                  documentation
   ${toggle_unit_tests}            unit tests
   ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
+  ${toggle_encode_perf_tests}     build encoder perf tests with unit tests
   --libc=PATH                     path to alternate libc
   --size-limit=WxH                max size to allow in the decoder
   --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
@@ -273,7 +274,7 @@
 EXPERIMENT_LIST="
     multiple_arf
     spatial_svc
-    denoising
+    vp9_temporal_denoising
     fp_mb_stats
 "
 CONFIG_LIST="
@@ -324,6 +325,7 @@
     webm_io
     libyuv
     decode_perf_tests
+    encode_perf_tests
     multi_res_encoding
     temporal_denoising
     experimental
@@ -380,6 +382,7 @@
     webm_io
     libyuv
     decode_perf_tests
+    encode_perf_tests
     multi_res_encoding
     temporal_denoising
     experimental
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index b612f23..11529b3 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -92,6 +92,7 @@
   const double fps = double(frames) / elapsed_secs;
 
   printf("{\n");
+  printf("\t\"type\" : \"decode_perf_test\",\n");
   printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
   printf("\t\"videoName\" : \"%s\",\n", video_name);
   printf("\t\"threadCount\" : %u,\n", threads);
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
new file mode 100644
index 0000000..feef37e
--- /dev/null
+++ b/test/encode_perf_test.cc
@@ -0,0 +1,170 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./vpx_version.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+const int kMaxPsnr = 100;  // initial value of the running minimum PSNR
+const double kUsecsInSec = 1000000.0;  // microseconds per second
+
+// One raw input clip for the encode benchmark together with the target
+// bitrate and the number of frames to read from it.
+struct EncodePerfTestVideo {
+  EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+                      uint32_t bitrate_, int frames_)
+      : name(name_), width(width_), height(height_),
+        bitrate(bitrate_), frames(frames_) {}
+
+  const char *name;   // file name given to the video source
+  uint32_t width;     // frame width given to the video source
+  uint32_t height;    // frame height given to the video source
+  uint32_t bitrate;   // value stored in cfg_.rc_target_bitrate
+  int frames;         // frame limit given to the video source
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {  // benchmark clips
+  EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484),
+  EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987),
+  EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718),
+  EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471),
+  EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv",
+                      640, 480, 200, 300),
+  EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 12 };  // set_speed() inputs
+
+#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))  // array element count
+
+// Encoder test fixture for the VP9 encode speed benchmark: it configures a
+// CBR encode with no lag and records the lowest PSNR reported during a pass.
+class VP9EncodePerfTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  VP9EncodePerfTest()
+      : EncoderTest(GET_PARAM(0)),
+        min_psnr_(kMaxPsnr),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)),
+        speed_(0) {}
+
+  virtual ~VP9EncodePerfTest() {}
+
+  // Resets the encoder configuration to the settings shared by every run.
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_resize_allowed = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+  }
+
+  // Sends the requested speed to the encoder when the source is at frame 1.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, speed_);
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPsnr;
+    nframes_ = 0;
+  }
+
+  // Keeps the smallest psnr[0] value seen in the PSNR packets of this pass.
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    const double frame_psnr = pkt->data.psnr.psnr[0];
+    if (frame_psnr < min_psnr_) min_psnr_ = frame_psnr;
+  }
+
+  // for performance reasons don't decode
+  virtual bool DoDecode() { return false; }
+
+  double min_psnr() const { return min_psnr_; }
+
+  void set_speed(unsigned int speed) { speed_ = speed; }
+
+ private:
+  double min_psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+  unsigned speed_;
+};
+
+// Times a full encode of each clip at each speed and prints one record per
+// run with the elapsed time, frame rate and minimum PSNR.
+TEST_P(VP9EncodePerfTest, PerfTest) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) {
+    const EncodePerfTestVideo &clip = kVP9EncodePerfTestVectors[i];
+    for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) {
+      const int speed = kEncodePerfTestSpeeds[j];
+      SetUp();
+
+      cfg_.g_timebase = timebase;
+      cfg_.rc_target_bitrate = clip.bitrate;
+      init_flags_ = VPX_CODEC_USE_PSNR;
+
+      libvpx_test::I420VideoSource video(clip.name, clip.width, clip.height,
+                                         timebase.den, timebase.num, 0,
+                                         clip.frames);
+      set_speed(speed);
+
+      // The timer brackets the RunLoop() call only.
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      vpx_usec_timer_mark(&timer);
+
+      const unsigned frames = clip.frames;
+      const double elapsed_secs =
+          vpx_usec_timer_elapsed(&timer) / kUsecsInSec;
+      const double fps = frames / elapsed_secs;
+      const double minimum_psnr = min_psnr();
+
+      printf("{\n");
+      printf("\t\"type\" : \"encode_perf_test\",\n");
+      printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+      printf("\t\"videoName\" : \"%s\",\n", clip.name);
+      printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
+      printf("\t\"totalFrames\" : %u,\n", frames);
+      printf("\t\"framesPerSecond\" : %f,\n", fps);
+      printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
+      printf("\t\"speed\" : %d\n", speed);
+      printf("}\n");
+    }
+  }
+}
+
+VP9_INSTANTIATE_TEST_CASE(  // instantiated for the real-time test mode only
+    VP9EncodePerfTest, ::testing::Values(::libvpx_test::kRealTime));
+}  // namespace
diff --git a/test/scale_border_test.cc b/test/scale_border_test.cc
new file mode 100644
index 0000000..cc9a69a
--- /dev/null
+++ b/test/scale_border_test.cc
@@ -0,0 +1,182 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+namespace {
+
+typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
+
+// Checks a frame-border extension function against a straightforward
+// reference implementation over a range of frame sizes.
+class ExtendBorderTest
+    : public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+ public:
+  virtual ~ExtendBorderTest() { libvpx_test::ClearSystemState(); }
+
+  // Allocates img_ and ref_img_ at the given size and gives both the same
+  // contents: filler bytes everywhere, then a pattern in the crop area.
+  void ResetImage(int width, int height) {
+    width_ = width;
+    height_ = height;
+    vpx_memset(&img_, 0, sizeof(img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+
+    vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
+    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+              img_.y_stride);
+    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+
+    vpx_memset(&ref_img_, 0, sizeof(ref_img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+
+    vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
+    FillPlane(ref_img_.y_buffer, ref_img_.y_crop_width, ref_img_.y_crop_height,
+              ref_img_.y_stride);
+    FillPlane(ref_img_.u_buffer,
+              ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+              ref_img_.uv_stride);
+    FillPlane(ref_img_.v_buffer,
+              ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+              ref_img_.uv_stride);
+  }
+
+  void DeallocImage() {
+    vp8_yv12_de_alloc_frame_buffer(&img_);
+    vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+  }
+
+ private:
+  static const int kBufFiller = 123;  // byte written outside the pattern
+  static const int kBufMax = kBufFiller - 1;  // pattern values stay below it
+
+  virtual void SetUp() { extend_fn_ = GetParam(); }
+
+  static void FillPlane(uint8_t *buf, int width, int height, int stride) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
+      }
+    }
+  }
+
+  void ReferenceExtendBorder() {
+    ExtendPlane(ref_img_.y_buffer,
+                ref_img_.y_crop_width, ref_img_.y_crop_height,
+                ref_img_.y_width, ref_img_.y_height,
+                ref_img_.y_stride,
+                ref_img_.border);
+    ExtendPlane(ref_img_.u_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+    ExtendPlane(ref_img_.v_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+  }
+
+  static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
+                          int width, int height, int stride, int padding) {
+    // Copy the outermost visible pixel to a distance of at least 'padding.'
+    // The buffers are allocated such that there may be excess space outside the
+    // padding. As long as the minimum amount of padding is achieved it is not
+    // necessary to fill this space as well.
+    uint8_t *left = buf - padding;
+    uint8_t *right = buf + crop_width;
+    const int right_extend = padding + (width - crop_width);
+    const int bottom_extend = padding + (height - crop_height);
+
+    // Fill the border pixels from the nearest image pixel.
+    for (int y = 0; y < crop_height; ++y) {
+      vpx_memset(left, left[padding], padding);
+      vpx_memset(right, right[-1], right_extend);
+      left += stride;
+      right += stride;
+    }
+
+    left = buf - padding;
+    uint8_t *top = left - (stride * padding);
+    // The buffer does not always extend as far as the stride.
+    // Equivalent to padding + width + padding.
+    const int extend_width = padding + crop_width + right_extend;
+
+    // The first row was already extended to the left and right. Copy it up.
+    for (int y = 0; y < padding; ++y) {
+      vpx_memcpy(top, left, extend_width);
+      top += stride;
+    }
+
+    uint8_t *bottom = left + (crop_height * stride);
+    for (int y = 0; y < bottom_extend; ++y) {
+      vpx_memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+      bottom += stride;
+    }
+  }
+
+  void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); }
+
+  void CompareImages() {
+    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, img_.buffer_alloc,
+                        ref_img_.frame_size));
+  }
+
+ protected:
+  // Runs the function under test and the reference on every width/height pair.
+  // A fatal failure in ResetImage() ends the test before buffers are used.
+  void RunTest() {
+#if ARCH_ARM
+    // Some arm devices OOM when trying to allocate the largest buffers.
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h]));
+        ExtendBorder();
+        ReferenceExtendBorder();
+        CompareImages();
+        DeallocImage();
+      }
+    }
+  }
+
+  YV12_BUFFER_CONFIG img_;
+  YV12_BUFFER_CONFIG ref_img_;
+  ExtendFrameBorderFunc extend_fn_;
+  int width_;
+  int height_;
+};
+
+TEST_P(ExtendBorderTest, ExtendBorder) {  // one run per registered function
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,  // only the C version is listed
+                        ::testing::Values(vp8_yv12_extend_frame_borders_c));
+}  // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 74f7842..b7a8f3f 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -669,3 +669,12 @@
 cc75f351818b9a619818f5cc77b9bc013d0c1e11  vp90-2-17-show-existing-frame.webm.md5
 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
 367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
+eb438c6540eb429f74404eedfa3228d409c57874  desktop_640_360_30.yuv
+89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab  kirland_640_480_30.yuv
+33c533192759e5bb4f07abfbac389dc259db4686  macmarcomoving_640_480_30.yuv
+8bfaab121080821b8f03b23467911e59ec59b8fe  macmarcostationary_640_480_30.yuv
+70894878d916a599842d9ad0dcd24e10c13e5467  niklas_640_480_30.yuv
+8784b6df2d8cc946195a90ac00540500d2e522e4  tacomanarrows_640_480_30.yuv
+edd86a1f5e62fd9da9a9d46078247759c2638009  tacomasmallcameramovement_640_480_30.yuv
+9a70e8b7d14fba9234d0e51dce876635413ce444  thaloundeskmtg_640_480_30.yuv
+e7d315dbf4f3928779e0dc624311196d44491d32  niklas_1280_720_30.yuv
diff --git a/test/test.mk b/test/test.mk
index e3f3054..4355237 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -69,6 +69,11 @@
 LIBVPX_TEST_SRCS-yes                   += decode_perf_test.cc
 endif
 
+# encode perf tests are vp9 only
+ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_VP9_ENCODER), yesyes)
+LIBVPX_TEST_SRCS-yes += encode_perf_test.cc
+endif
+
 ##
 ## WHITE BOX TESTS
 ##
@@ -94,6 +99,7 @@
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
+LIBVPX_TEST_SRCS-yes                   += scale_border_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 
 endif # VP8
@@ -838,3 +844,15 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
   vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
 endif  # CONFIG_DECODE_PERF_TESTS
+
+ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+endif  # CONFIG_ENCODE_PERF_TESTS
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 5469770..9dc7c6a 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -90,14 +90,14 @@
 
     rnd(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-    src_ = new uint8_t[block_size_];
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     ref_ = new uint8_t[block_size_];
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
-    delete[] src_;
+    vpx_free(src_);
     delete[] ref_;
     libvpx_test::ClearSystemState();
   }
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 12e5011..469d0d6 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1879,6 +1879,13 @@
      */
     cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
     cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+    if (cpi->oxcf.number_of_layers == 1) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+    } else if (cpi->oxcf.number_of_layers == 2) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+    }
     cpi->cyclic_refresh_mode_index = 0;
     cpi->cyclic_refresh_q = 32;
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 28c674a..cc6f955 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -627,9 +627,13 @@
                        "Width and height beyond allowed size.");
 #endif
   if (cm->width != width || cm->height != height) {
+    const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+    const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+
     // Change in frame size (assumption: color format does not change).
     if (cm->width == 0 || cm->height == 0 ||
-        width * height > cm->width * cm->height) {
+        aligned_width > cm->width ||
+        aligned_width * aligned_height > cm->width * cm->height) {
       if (vp9_alloc_context_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate frame buffers");
@@ -662,6 +666,7 @@
                                        struct vp9_read_bit_buffer *rb) {
   int width, height;
   int found = 0, i;
+  int has_valid_ref_frame = 0;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     if (vp9_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
@@ -675,15 +680,21 @@
   if (!found)
     vp9_read_frame_size(rb, &width, &height);
 
-  // Check that each of the frames that this frame references has valid
-  // dimensions.
+  if (width <=0 || height <= 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid frame size");
+
+  // Check to make sure at least one of the frames that this frame references
+  // has valid dimensions.
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_size(ref_frame->buf->y_width, ref_frame->buf->y_height,
-                              width, height))
-      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Referenced frame has invalid size");
+    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_width,
+                                                ref_frame->buf->y_height,
+                                                width, height);
   }
+  if (!has_valid_ref_frame)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Referenced frame has invalid size");
 
   resize_context_buffers(cm, width, height);
   setup_display_size(cm, rb);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 1afaee1..32e80f9 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -435,6 +435,11 @@
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    const int ref_idx = frame - LAST_FRAME;
+    if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE ||
+        cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE )
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
     vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
                      mi_row, mi_col);
   }
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 6d76914..b11a0ae 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -41,7 +41,7 @@
   int64_t tx_rd_diff[TX_MODES];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   unsigned int newmv_sse;
   unsigned int zeromv_sse;
   PREDICTION_MODE best_sse_inter_mode;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index fc329b0..584bcb8 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1333,7 +1333,7 @@
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
   update_state_rt(cpi, ctx, mi_row, mi_col, bsize);
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) {
     vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col,
                          MAX(BLOCK_8X8, bsize), ctx);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index b1a5e08..f8d2611 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -131,7 +131,8 @@
   }
 
   if (cm->frame_type == KEY_FRAME) {
-    cpi->refresh_golden_frame = 1;
+    if (!(cpi->use_svc && cpi->svc.number_temporal_layers == 1))
+      cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
   } else {
     cm->fc = cm->frame_contexts[cm->frame_context_idx];
@@ -671,7 +672,7 @@
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
                        cm->subsampling_x, cm->subsampling_y,
@@ -865,7 +866,7 @@
   cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
   yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
@@ -1119,7 +1120,7 @@
 #endif
   }
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_free(&(cpi->denoiser));
   }
@@ -1143,7 +1144,7 @@
   vp9_remove_common(&cpi->common);
   vpx_free(cpi);
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
   fclose(yuv_denoised_file);
 #endif
@@ -1357,7 +1358,7 @@
 }
 #endif
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
 #if defined(OUTPUT_YUV_DENOISED)
 // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
 // as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
@@ -1604,7 +1605,7 @@
     ref_cnt_fb(cm->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
   }
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_update_frame_info(&cpi->denoiser,
                                    *cpi->Source,
@@ -2001,7 +2002,8 @@
   if (cpi->gold_is_last)
     cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX)
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      !(cpi->use_svc && cpi->svc.number_temporal_layers == 1))
     cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
 
   if (cpi->alt_is_last)
@@ -2241,7 +2243,7 @@
     encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
   }
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 1419cf6..4b3f2ad 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -37,7 +37,7 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_variance.h"
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
 #endif
 
@@ -430,7 +430,7 @@
   int multi_arf_enabled;
   int multi_arf_last_grp_enabled;
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
 } VP9_COMP;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 0140fb5..30a0e9d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -602,7 +602,7 @@
         }
       }
 
-#if CONFIG_DENOISING
+#if CONFIG_VP9_TEMPORAL_DENOISING
       if (cpi->oxcf.noise_sensitivity > 0) {
         vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
                                         this_mode, ctx);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index f0f9afc..1adbad9 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1238,6 +1238,7 @@
 
     if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
       cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
+      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
     }
 
     if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
@@ -1252,7 +1253,10 @@
         lc->is_key_frame = 0;
       } else {
         lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+        if (lc->is_key_frame)
+          cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
       }
+      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
     }
 
     if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 30a585c..f65ac7b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -434,22 +434,26 @@
   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 
   txfm_rd_in_plane(x, rate, distortion, skip,
-                   &sse[mbmi->tx_size], ref_best_rd, 0, bs,
+                   sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
   cpi->tx_stepdown_count[0]++;
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int (*r)[2], int *rate,
-                                   int64_t *d, int64_t *distortion,
-                                   int *s, int *skip,
+                                   int *rate,
+                                   int64_t *distortion,
+                                   int *skip,
+                                   int64_t *psse,
                                    int64_t tx_cache[TX_MODES],
+                                   int64_t ref_best_rd,
                                    BLOCK_SIZE bs) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  int r[TX_SIZES][2], s[TX_SIZES];
+  int64_t d[TX_SIZES], sse[TX_SIZES];
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
@@ -466,6 +470,9 @@
   s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
+                     &sse[n], ref_best_rd, 0, bs, n,
+                     cpi->sf.use_fast_coef_costing);
     r[n][1] = r[n][0];
     if (r[n][0] < INT_MAX) {
       for (m = 0; m <= n - (n == max_tx_size); m++) {
@@ -496,6 +503,7 @@
   *distortion = d[mbmi->tx_size];
   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
   *skip       = s[mbmi->tx_size];
+  *psse       = sse[mbmi->tx_size];
 
   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
@@ -522,65 +530,39 @@
                                   int64_t *psse, BLOCK_SIZE bs,
                                   int64_t txfm_cache[TX_MODES],
                                   int64_t ref_best_rd) {
-  int r[TX_SIZES][2], s[TX_SIZES];
-  int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-  TX_SIZE tx_size;
 
-  assert(bs == mbmi->sb_type);
+  assert(bs == xd->mi[0]->mbmi.sb_type);
 
   vp9_subtract_plane(x, bs, 0);
 
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd,
                            bs);
-    if (psse)
-      *psse = sse[mbmi->tx_size];
-    return;
+  } else {
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse,
+                           txfm_cache, ref_best_rd, bs);
   }
-
-  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size],
-                     &sse[tx_size], ref_best_rd, 0, bs, tx_size,
-                     cpi->sf.use_fast_coef_costing);
-  choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s,
-                         skip, txfm_cache, bs);
-
-  if (psse)
-    *psse = sse[mbmi->tx_size];
 }
 
 static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                   int64_t *distortion, int *skip,
-                                  int64_t *psse, BLOCK_SIZE bs,
+                                  BLOCK_SIZE bs,
                                   int64_t txfm_cache[TX_MODES],
                                   int64_t ref_best_rd) {
-  int64_t sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t sse;
 
-  assert(bs == mbmi->sb_type);
+  assert(bs == xd->mi[0]->mbmi.sb_type);
   if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd,
                            bs);
   } else {
-    int r[TX_SIZES][2], s[TX_SIZES];
-    int64_t d[TX_SIZES];
-    TX_SIZE tx_size;
-    for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
-      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
-                       &s[tx_size], &sse[tx_size],
-                       ref_best_rd, 0, bs, tx_size,
-                       cpi->sf.use_fast_coef_costing);
-    choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           bs);
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse,
+                           txfm_cache, ref_best_rd, bs);
   }
-  if (psse)
-    *psse = sse[mbmi->tx_size];
 }
 
 
@@ -834,7 +816,7 @@
     mic->mbmi.mode = mode;
 
     intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, NULL, bsize, local_tx_cache, best_rd);
+        &s, bsize, local_tx_cache, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -2722,7 +2704,7 @@
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                             bsize, tx_cache, best_rd);
 
       if (rate_y == INT_MAX)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3f2f5b9..3381cb9 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -23,8 +23,8 @@
 VP9_CX_SRCS-yes += encoder/vp9_cost.h
 VP9_CX_SRCS-yes += encoder/vp9_cost.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
-VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.c
-VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.h
+VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
+VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
deleted file mode 100644
index b2eb9eb..0000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
+++ /dev/null
@@ -1,308 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_yv12_extend_frame_borders_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_scale_asm_offsets.asm
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
-; we depend on VP8BORDERINPIXELS being 32
-
-|vp8_yv12_extend_frame_borders_neon| PROC
-    push            {r4 - r10, lr}
-    vpush           {d8 - d15}
-
-    ; Border = 32
-    ldr             r3, [r0, #yv12_buffer_config_y_width]  ; plane_width
-    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
-    ldr             r4, [r0, #yv12_buffer_config_y_height] ; plane_height
-    ldr             lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride
-
-; Border copy for Y plane
-; copy the left and right most columns out
-    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
-    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
-    sub             r5, r1, #32             ; dest_ptr1 = src_ptr1 - Border
-
-    mov             r12, r4, lsr #2         ; plane_height / 4
-
-copy_left_right_y
-    vld1.8          {d0[], d1[]}, [r1], lr
-    vld1.8          {d4[], d5[]}, [r2], lr
-    vld1.8          {d8[], d9[]}, [r1], lr
-    vld1.8          {d12[], d13[]}, [r2], lr
-    vld1.8          {d16[], d17[]}, [r1], lr
-    vld1.8          {d20[], d21[]}, [r2], lr
-    vld1.8          {d24[], d25[]}, [r1], lr
-    vld1.8          {d28[], d29[]}, [r2], lr
-
-    vmov            q1, q0
-    vmov            q3, q2
-    vmov            q5, q4
-    vmov            q7, q6
-    vmov            q9, q8
-    vmov            q11, q10
-    vmov            q13, q12
-    vmov            q15, q14
-
-    subs            r12, r12, #1
-
-    vst1.8          {q0, q1}, [r5], lr
-    vst1.8          {q2, q3}, [r6], lr
-    vst1.8          {q4, q5}, [r5], lr
-    vst1.8          {q6, q7}, [r6], lr
-    vst1.8          {q8, q9}, [r5], lr
-    vst1.8          {q10, q11}, [r6], lr
-    vst1.8          {q12, q13}, [r5], lr
-    vst1.8          {q14, q15}, [r6], lr
-
-    bne             copy_left_right_y
-
-;Now copy the top and bottom source lines into each line of the respective borders
-    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
-    mul             r8, r4, lr              ; plane_height * plane_stride
-
-    ; copy width is plane_stride
-    movs            r12, lr, lsr #7         ; plane_stride / 128
-
-    sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
-    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride))
-    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
-    sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
-    ble             extra_y_copy_needed     ; plane stride < 128
-
-copy_top_bottom_y
-    vld1.8          {q0, q1}, [r1]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q2, q3}, [r1]!
-    vld1.8          {q10, q11}, [r2]!
-    vld1.8          {q4, q5}, [r1]!
-    vld1.8          {q12, q13}, [r2]!
-    vld1.8          {q6, q7}, [r1]!
-    vld1.8          {q14, q15}, [r2]!
-
-    mov             r7, #32                 ; Border
-
-top_bottom_32
-    subs            r7, r7, #1
-
-    vst1.8          {q0, q1}, [r5]!
-    vst1.8          {q8, q9}, [r6]!
-    vst1.8          {q2, q3}, [r5]!
-    vst1.8          {q10, q11}, [r6]!
-    vst1.8          {q4, q5}, [r5]!
-    vst1.8          {q12, q13}, [r6]!
-    vst1.8          {q6, q7}, [r5]!
-    vst1.8          {q14, q15}, [r6]!
-
-    add             r5, r5, lr              ; dest_ptr1 += plane_stride
-    sub             r5, r5, #128            ; dest_ptr1 -= 128
-    add             r6, r6, lr              ; dest_ptr2 += plane_stride
-    sub             r6, r6, #128            ; dest_ptr2 -= 128
-
-    bne             top_bottom_32
-
-    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border* plane_stride)
-    add             r6, r2, lr              ; src_ptr2 + plane_stride
-
-    subs            r12, r12, #1
-    bne             copy_top_bottom_y
-
-extra_y_copy_needed
-    mov             r7, lr, lsr #4          ; check to see if extra copy is needed
-    ands            r7, r7, #0x7
-    bne             extra_top_bottom_y
-end_of_border_copy_y
-
-;Border copy for U, V planes
-; Border = 16
-    ldr             r7, [r0, #yv12_buffer_config_u_buffer]  ; src_ptr1
-    ldr             lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
-    ldr             r3, [r0, #yv12_buffer_config_uv_width]  ; plane_width
-    ldr             r4, [r0, #yv12_buffer_config_uv_height] ; plane_height
-
-    mov             r10, #2
-
-;copy the left and right most columns out
-border_copy_uv
-    mov             r1, r7                  ; src_ptr1 needs to be saved for second half of loop
-    sub             r5, r1, #16             ; dest_ptr1 = src_ptr1 - Border
-    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
-    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
-
-    mov             r12, r4, lsr #3         ; plane_height / 8
-
-copy_left_right_uv
-    vld1.8          {d0[], d1[]}, [r1], lr
-    vld1.8          {d2[], d3[]}, [r2], lr
-    vld1.8          {d4[], d5[]}, [r1], lr
-    vld1.8          {d6[], d7[]}, [r2], lr
-    vld1.8          {d8[], d9[]},  [r1], lr
-    vld1.8          {d10[], d11[]}, [r2], lr
-    vld1.8          {d12[], d13[]}, [r1], lr
-    vld1.8          {d14[], d15[]}, [r2], lr
-    vld1.8          {d16[], d17[]}, [r1], lr
-    vld1.8          {d18[], d19[]}, [r2], lr
-    vld1.8          {d20[], d21[]}, [r1], lr
-    vld1.8          {d22[], d23[]}, [r2], lr
-    vld1.8          {d24[], d25[]}, [r1], lr
-    vld1.8          {d26[], d27[]}, [r2], lr
-    vld1.8          {d28[], d29[]}, [r1], lr
-    vld1.8          {d30[], d31[]}, [r2], lr
-
-    subs            r12, r12, #1
-
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q1}, [r6], lr
-    vst1.8          {q2}, [r5], lr
-    vst1.8          {q3}, [r6], lr
-    vst1.8          {q4}, [r5], lr
-    vst1.8          {q5}, [r6], lr
-    vst1.8          {q6}, [r5], lr
-    vst1.8          {q7}, [r6], lr
-    vst1.8          {q8}, [r5], lr
-    vst1.8          {q9}, [r6], lr
-    vst1.8          {q10}, [r5], lr
-    vst1.8          {q11}, [r6], lr
-    vst1.8          {q12}, [r5], lr
-    vst1.8          {q13}, [r6], lr
-    vst1.8          {q14}, [r5], lr
-    vst1.8          {q15}, [r6], lr
-
-    bne             copy_left_right_uv
-
-;Now copy the top and bottom source lines into each line of the respective borders
-    mov             r1, r7
-    mul             r8, r4, lr              ; plane_height * plane_stride
-    movs            r12, lr, lsr #6         ; plane_stride / 64
-
-    sub             r1, r1, #16             ; src_ptr1 = u_buffer - Border
-    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride)
-    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
-    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
-    ble             extra_uv_copy_needed    ; plane_stride < 64
-
-copy_top_bottom_uv
-    vld1.8          {q0, q1}, [r1]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q2, q3}, [r1]!
-    vld1.8          {q10, q11}, [r2]!
-
-    mov             r7, #16                 ; Border
-
-top_bottom_16
-    subs            r7, r7, #1
-
-    vst1.8          {q0, q1}, [r5]!
-    vst1.8          {q8, q9}, [r6]!
-    vst1.8          {q2, q3}, [r5]!
-    vst1.8          {q10, q11}, [r6]!
-
-    add             r5, r5, lr              ; dest_ptr1 += plane_stride
-    sub             r5, r5, #64
-    add             r6, r6, lr              ; dest_ptr2 += plane_stride
-    sub             r6, r6, #64
-
-    bne             top_bottom_16
-
-    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
-    add             r6, r2, lr              ; dest_ptr2 = src_ptr2 + plane_stride
-
-    subs            r12, r12, #1
-    bne             copy_top_bottom_uv
-extra_uv_copy_needed
-    mov             r7, lr, lsr #3          ; check to see if extra copy is needed
-    ands            r7, r7, #0x7
-    bne             extra_top_bottom_uv
-
-end_of_border_copy_uv
-    subs            r10, r10, #1
-    ldrne           r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1
-    bne             border_copy_uv
-
-    vpop            {d8 - d15}
-    pop             {r4 - r10, pc}
-
-;;;;;;;;;;;;;;;;;;;;;;
-extra_top_bottom_y
-    vld1.8          {q0}, [r1]!
-    vld1.8          {q2}, [r2]!
-
-    mov             r9, #4                  ; 32 >> 3
-
-extra_top_bottom_32
-    subs            r9, r9, #1
-
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    bne             extra_top_bottom_32
-
-    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
-    add             r6, r2, lr              ; src_ptr2 + plane_stride
-    subs            r7, r7, #1
-    bne             extra_top_bottom_y
-
-    b               end_of_border_copy_y
-
-extra_top_bottom_uv
-    vld1.8          {d0}, [r1]!
-    vld1.8          {d8}, [r2]!
-
-    mov             r9, #2                  ; 16 >> 3
-
-extra_top_bottom_16
-    subs            r9, r9, #1
-
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    bne             extra_top_bottom_16
-
-    sub             r5, r1, lr, asl #4      ; src_ptr1 - (Border * plane_stride)
-    add             r6, r2, lr              ; src_ptr2 + plane_stride
-    subs            r7, r7, #1
-    bne             extra_top_bottom_uv
-
-    b               end_of_border_copy_uv
-
-    ENDP
-    END
diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
index fac7bbc..d408eb3 100644
--- a/vpx_scale/arm/neon/yv12extend_arm.c
+++ b/vpx_scale/arm/neon/yv12extend_arm.c
@@ -17,5 +17,5 @@
 void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc,
                               struct yv12_buffer_config *dst_ybc) {
   vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
-  vp8_yv12_extend_frame_borders_neon(dst_ybc);
+  vp8_yv12_extend_frame_borders_c(dst_ybc);
 }
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 675d905..827bce7 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -81,6 +81,8 @@
     ybf->y_height = aligned_height;
     ybf->y_stride = y_stride;
 
+    ybf->uv_crop_width = (width + 1) / 2;
+    ybf->uv_crop_height = (height + 1) / 2;
     ybf->uv_width = uv_width;
     ybf->uv_height = uv_height;
     ybf->uv_stride = uv_stride;
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 614602a..036a505 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -56,6 +56,9 @@
 }
 
 void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  const int uv_border = ybf->border / 2;
+
+  assert(ybf->border % 2 == 0);
   assert(ybf->y_height - ybf->y_crop_height < 16);
   assert(ybf->y_width - ybf->y_crop_width < 16);
   assert(ybf->y_height - ybf->y_crop_height >= 0);
@@ -68,16 +71,16 @@
                ybf->border + ybf->y_width - ybf->y_crop_width);
 
   extend_plane(ybf->u_buffer, ybf->uv_stride,
-               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
-               ybf->border / 2, ybf->border / 2,
-               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
-               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               uv_border, uv_border,
+               uv_border + ybf->uv_height - ybf->uv_crop_height,
+               uv_border + ybf->uv_width - ybf->uv_crop_width);
 
   extend_plane(ybf->v_buffer, ybf->uv_stride,
-               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
-               ybf->border / 2, ybf->border / 2,
-               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
-               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               uv_border, uv_border,
+               uv_border + ybf->uv_height - ybf->uv_crop_height,
+               uv_border + ybf->uv_width - ybf->uv_crop_width);
 }
 
 #if CONFIG_VP9
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 95e7483..1fa41af 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -12,7 +12,6 @@
 #neon
 SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
 SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
 SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/yv12extend_arm.c
 
 #mips(dspr2)
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 2e3f1ff..5a7f973 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -17,8 +17,6 @@
 }
 
 add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
-specialize qw/vp8_yv12_extend_frame_borders neon_asm/;
-$vp8_yv12_extend_frame_borders_neon_asm=vp8_yv12_extend_frame_borders_neon;
 
 add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 specialize qw/vp8_yv12_copy_frame neon_asm/;