Merge "arm: Fix building vp8_mse16x16_neon.c with MSVC"
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 44eba33..70b3009 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -71,6 +71,7 @@
     if (ext_fb_list_[idx].size < min_size) {
       delete [] ext_fb_list_[idx].data;
       ext_fb_list_[idx].data = new uint8_t[min_size];
+      memset(ext_fb_list_[idx].data, 0, min_size);
       ext_fb_list_[idx].size = min_size;
     }
 
diff --git a/test/test.mk b/test/test.mk
index b92b6da..abf815c 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -128,6 +128,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
new file mode 100644
index 0000000..7d08d9e
--- /dev/null
+++ b/test/vp9_intrapred_test.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "test/util.h"
+
+namespace {
+
+using libvpx_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+// Base class for VP9 intra prediction tests.
+class VP9IntraPredBase {
+ public:
+  virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  virtual void Predict(PREDICTION_MODE mode) = 0;
+
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    // For each pixel ensure that the calculated value is the same as reference.
+    for (int y = 0; y < block_size_; y++) {
+      for (int x = 0; x < block_size_; x++) {
+        *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+              << " Failed on Test Case Number "<< test_case_number;
+        }
+      }
+    }
+  }
+
+  void RunTest(uint16_t* left_col, uint16_t* above_data,
+               uint16_t* dst, uint16_t* ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    above_row_ = above_data + 16;
+    int error_count = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_size_*2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_size_; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+      Predict(DC_PRED);
+      CheckPrediction(i, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+  int block_size_;
+  uint16_t *above_row_;
+  uint16_t *left_col_;
+  uint16_t *dst_;
+  uint16_t *ref_dst_;
+  ptrdiff_t stride_;
+  int mask_;
+};
+
+typedef void (*intra_pred_fn_t)(
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+      const uint16_t *left, int bps);
+typedef std::tr1::tuple<intra_pred_fn_t,
+                        intra_pred_fn_t, int, int> intra_pred_params_t;
+class VP9IntraPredTest
+    : public VP9IntraPredBase,
+      public ::testing::TestWithParam<intra_pred_params_t> {
+
+  virtual void SetUp() {
+    pred_fn_    = GET_PARAM(0);
+    ref_fn_     = GET_PARAM(1);
+    block_size_ = GET_PARAM(2);
+    bit_depth_  = GET_PARAM(3);
+    stride_     = block_size_ * 3;
+    mask_       = (1 << bit_depth_) - 1;
+  }
+
+  virtual void Predict(PREDICTION_MODE mode) {
+    const uint16_t *const_above_row = above_row_;
+    const uint16_t *const_left_col = left_col_;
+    ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(pred_fn_(dst_, stride_, const_above_row,
+                                      const_left_col, bit_depth_));
+  }
+  intra_pred_fn_t pred_fn_;
+  intra_pred_fn_t ref_fn_;
+  int bit_depth_;
+};
+
+TEST_P(VP9IntraPredTest, IntraPredTests) {
+  // max block size is 32
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 2*32);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 2*32+32);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst, 3 * 32 * 32);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_dst, 3 * 32 * 32);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
+                                       &vp9_high_dc_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
+                                       &vp9_high_tm_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
+                                       &vp9_high_tm_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                       &vp9_high_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                       &vp9_high_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 8)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                       &vp9_high_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                       &vp9_high_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 8)));
+#endif
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
+                                       &vp9_high_dc_predictor_32x32_c, 32, 10),
+                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
+                                       &vp9_high_tm_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
+                                       &vp9_high_tm_predictor_32x32_c, 32, 10),
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                   &vp9_high_dc_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 10),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                       &vp9_high_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 10)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                       &vp9_high_dc_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 10),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                   &vp9_high_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 10)));
+#endif
+
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
+                                       &vp9_high_dc_predictor_32x32_c, 32, 12),
+                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
+                                       &vp9_high_tm_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
+                                       &vp9_high_tm_predictor_32x32_c, 32, 12),
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                       &vp9_high_dc_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 12),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                       &vp9_high_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
+                                       &vp9_high_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
+                                       &vp9_high_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
+                                       &vp9_high_dc_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_high_v_predictor_4x4_sse,
+                                       &vp9_high_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
+                                       &vp9_high_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
+                                       &vp9_high_v_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
+                                       &vp9_high_v_predictor_32x32_c, 32, 12),
+                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
+                                       &vp9_high_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
+                                       &vp9_high_tm_predictor_8x8_c, 8, 12)));
+#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+}  // namespace
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 5587192..8305e7f 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -65,6 +65,18 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+static INLINE uint16_t clip_pixel_high(int val, int bd) {
+  switch (bd) {
+    case 8:
+    default:
+      return (uint16_t)clamp(val, 0, 255);
+    case 10:
+      return (uint16_t)clamp(val, 0, 1023);
+    case 12:
+      return (uint16_t)clamp(val, 0, 4095);
+  }
+}
+
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 ))
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
index 733b3a9..34795b7 100644
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -61,6 +61,10 @@
     if (!int_fb_list->int_fb[i].data)
       return -1;
 
+    // This memset is needed for fixing valgrind error from C loop filter
+    // due to access uninitialized memory in frame border. It could be
+    // removed if border is totally removed.
+    vpx_memset(int_fb_list->int_fb[i].data, 0, min_size);
     int_fb_list->int_fb[i].size = min_size;
   }
 
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 3332e58..564a3eb 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -47,6 +47,78 @@
   1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    10,    13,    15,    17,    20,    22,
+  25,    28,    31,    34,    37,    40,    43,    47,
+  50,    53,    57,    60,    64,    68,    71,    75,
+  78,    82,    86,    90,    93,    97,   101,   105,
+  109,   113,   116,   120,   124,   128,   132,   136,
+  140,   143,   147,   151,   155,   159,   163,   166,
+  170,   174,   178,   182,   185,   189,   193,   197,
+  200,   204,   208,   212,   215,   219,   223,   226,
+  230,   233,   237,   241,   244,   248,   251,   255,
+  259,   262,   266,   269,   273,   276,   280,   283,
+  287,   290,   293,   297,   300,   304,   307,   310,
+  314,   317,   321,   324,   327,   331,   334,   337,
+  343,   350,   356,   362,   369,   375,   381,   387,
+  394,   400,   406,   412,   418,   424,   430,   436,
+  442,   448,   454,   460,   466,   472,   478,   484,
+  490,   499,   507,   516,   525,   533,   542,   550,
+  559,   567,   576,   584,   592,   601,   609,   617,
+  625,   634,   644,   655,   666,   676,   687,   698,
+  708,   718,   729,   739,   749,   759,   770,   782,
+  795,   807,   819,   831,   844,   856,   868,   880,
+  891,   906,   920,   933,   947,   961,   975,   988,
+  1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+  1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+  1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+  1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+  1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+  1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+  1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+  2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+  2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+  3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+  4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+};
+
+static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+  4,    12,    18,    25,    33,    41,    50,    60,
+  70,    80,    91,   103,   115,   127,   140,   153,
+  166,   180,   194,   208,   222,   237,   251,   266,
+  281,   296,   312,   327,   343,   358,   374,   390,
+  405,   421,   437,   453,   469,   484,   500,   516,
+  532,   548,   564,   580,   596,   611,   627,   643,
+  659,   674,   690,   706,   721,   737,   752,   768,
+  783,   798,   814,   829,   844,   859,   874,   889,
+  904,   919,   934,   949,   964,   978,   993,  1008,
+  1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+  1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+  1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+  1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+  1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+  2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+  2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+  3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+  3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+  4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+  5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+  5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+  6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+  7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+  8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+  10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+  12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+  16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+};
+#endif
+
 static const int16_t ac_qlookup[QINDEX_RANGE] = {
   4,       8,    9,   10,   11,   12,   13,   14,
   15,     16,   17,   18,   19,   20,   21,   22,
@@ -82,15 +154,116 @@
   1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };
 
-int16_t vp9_dc_quant(int qindex, int delta) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    11,    13,    16,    18,    21,    24,
+  27,    30,    33,    37,    40,    44,    48,    51,
+  55,    59,    63,    67,    71,    75,    79,    83,
+  88,    92,    96,   100,   105,   109,   114,   118,
+  122,   127,   131,   136,   140,   145,   149,   154,
+  158,   163,   168,   172,   177,   181,   186,   190,
+  195,   199,   204,   208,   213,   217,   222,   226,
+  231,   235,   240,   244,   249,   253,   258,   262,
+  267,   271,   275,   280,   284,   289,   293,   297,
+  302,   306,   311,   315,   319,   324,   328,   332,
+  337,   341,   345,   349,   354,   358,   362,   367,
+  371,   375,   379,   384,   388,   392,   396,   401,
+  409,   417,   425,   433,   441,   449,   458,   466,
+  474,   482,   490,   498,   506,   514,   523,   531,
+  539,   547,   555,   563,   571,   579,   588,   596,
+  604,   616,   628,   640,   652,   664,   676,   688,
+  700,   713,   725,   737,   749,   761,   773,   785,
+  797,   809,   825,   841,   857,   873,   889,   905,
+  922,   938,   954,   970,   986,  1002,  1018,  1038,
+  1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+  1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+  1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+  1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+  1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+  2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+  2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+  2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+  3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+  4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+  4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+  5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+  6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+};
+
+static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+  4,    13,    19,    27,    35,    44,    54,    64,
+  75,    87,    99,   112,   126,   139,   154,   168,
+  183,   199,   214,   230,   247,   263,   280,   297,
+  314,   331,   349,   366,   384,   402,   420,   438,
+  456,   475,   493,   511,   530,   548,   567,   586,
+  604,   623,   642,   660,   679,   698,   716,   735,
+  753,   772,   791,   809,   828,   846,   865,   884,
+  902,   920,   939,   957,   976,   994,  1012,  1030,
+  1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+  1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+  1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+  1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+  1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+  2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+  2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+  3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+  4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+  4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+  6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+  7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+  8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+  10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+  11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+  13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+  16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+  18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+  21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+  25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+};
+#endif
+
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_10:
+      return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_12:
+      return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
 }
 
-int16_t vp9_ac_quant(int qindex, int delta) {
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_10:
+      return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_12:
+      return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
   return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
 }
 
-
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
index d1545d9..b626605 100644
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -11,6 +11,7 @@
 #ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
+#include "vpx/vpx_codec.h"
 #include "vp9/common/vp9_blockd.h"
 
 #ifdef __cplusplus
@@ -22,8 +23,8 @@
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8
 
-int16_t vp9_dc_quant(int qindex, int delta);
-int16_t vp9_ac_quant(int qindex, int delta);
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
 
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 471929a..0fd8d0c 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -40,11 +40,291 @@
     type##_predictor(dst, stride, size, above, left); \
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define intra_pred_high_sized(type, size) \
+  void vp9_high_##type##_predictor_##size##x##size##_c( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    high_##type##_predictor(dst, stride, size, above, left, bd); \
+  }
+
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_high_sized(type, 4) \
+  intra_pred_high_sized(type, 8) \
+  intra_pred_high_sized(type, 16) \
+  intra_pred_high_sized(type, 32)
+
+#else
+
 #define intra_pred_allsizes(type) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void) above;
+  (void) bd;
+  int r, c;
+
+  // First column.
+  for (r = 0; r < bs - 1; ++r) {
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
+  }
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // Second column.
+  for (r = 0; r < bs - 2; ++r) {
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 +
+                                         left[r + 2], 2);
+  }
+  dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] +
+                                              left[bs - 1] * 3, 2);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // Rest of last row.
+  for (c = 0; c < bs - 2; ++c)
+    dst[(bs - 1) * stride + c] = left[bs - 1];
+
+  for (r = bs - 2; r >= 0; --r) {
+    for (c = 0; c < bs - 2; ++c)
+      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+  }
+}
+
+static INLINE void high_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void) left;
+  (void) bd;
+  int r, c;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
+                                          above[r/2 + c + 1] * 2 +
+                                          above[r/2 + c + 2], 2)
+                     : ROUND_POWER_OF_TWO(above[r/2 + c] +
+                                          above[r/2 + c + 1], 1);
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void high_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void) left;
+  (void) bd;
+  int r, c;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      dst[c] = r + c + 2 < bs * 2 ?  ROUND_POWER_OF_TWO(above[r + c] +
+                                                        above[r + c + 1] * 2 +
+                                                        above[r + c + 2], 2)
+                                  : above[bs * 2 - 1];
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void high_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void) bd;
+  int r, c;
+
+  // first row
+  for (c = 0; c < bs; c++)
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1);
+  dst += stride;
+
+  // second row
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
+  for (c = 1; c < bs; c++)
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
+  dst += stride;
+
+  // the rest of first col
+  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
+  for (r = 3; r < bs; ++r)
+    dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 +
+                                               left[r - 1], 2);
+
+  // the rest of the block
+  for (r = 2; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-2 * stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void high_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void) bd;
+  int r, c;
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
+  for (c = 1; c < bs; c++)
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
+
+  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
+  for (r = 2; r < bs; ++r)
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
+                                         left[r], 2);
+
+  dst += stride;
+  for (r = 1; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void high_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void) bd;
+  int r, c;
+  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1);
+  for (r = 1; r < bs; r++)
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1);
+  dst++;
+
+  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
+  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
+  for (r = 2; r < bs; r++)
+    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
+                                         left[r], 2);
+  dst++;
+
+  for (c = 0; c < bs - 2; c++)
+    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2);
+  dst += stride;
+
+  for (r = 1; r < bs; ++r) {
+    for (c = 0; c < bs - 2; c++)
+      dst[c] = dst[-stride + c - 2];
+    dst += stride;
+  }
+}
+
+static INLINE void high_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above,
+                                    const uint16_t *left, int bd) {
+  (void) left;
+  (void) bd;
+  int r;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memcpy(dst, above, bs * sizeof(uint16_t));
+    dst += stride;
+  }
+}
+
+static INLINE void high_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  (void) above;
+  (void) bd;
+  int r;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, left[r], bs);
+    dst += stride;
+  }
+}
+
+static INLINE void high_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void) bd;
+  int r, c;
+  int ytop_left = above[-1];
+
+  for (r = 0; r < bs; r++) {
+    for (c = 0; c < bs; c++)
+      dst[c] = clip_pixel_high(left[r] + above[c] - ytop_left, bd);
+    dst += stride;
+  }
+}
+
+static INLINE void high_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  (void) above;
+  (void) left;
+  int r;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, 128 << (bd - 8), bs);
+    dst += stride;
+  }
+}
+
+static INLINE void high_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          int bs, const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void) above;
+  (void) bd;
+  int i, r, expected_dc, sum = 0;
+
+  for (i = 0; i < bs; i++)
+    sum += left[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void high_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  (void) left;
+  (void) bd;
+  int i, r, expected_dc, sum = 0;
+
+  for (i = 0; i < bs; i++)
+    sum += above[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void high_dc_predictor(uint16_t *dst, ptrdiff_t stride,
+                                     int bs, const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void) bd;
+  int i, r, expected_dc, sum = 0;
+  const int count = 2 * bs;
+
+  for (i = 0; i < bs; i++) {
+    sum += above[i];
+    sum += left[i];
+  }
+
+  expected_dc = (sum + (count >> 1)) / count;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
@@ -293,6 +573,14 @@
 static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
 static intra_pred_fn dc_pred[2][2][TX_SIZES];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][4];
+static intra_high_pred_fn dc_pred_high[2][2][4];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_init_intra_predictors() {
 #define INIT_ALL_SIZES(p, type) \
   p[TX_4X4] = vp9_##type##_predictor_4x4; \
@@ -315,9 +603,164 @@
   INIT_ALL_SIZES(dc_pred[1][0], dc_left);
   INIT_ALL_SIZES(dc_pred[1][1], dc);
 
-#undef INIT_ALL_SIZES
+#if CONFIG_VP9_HIGHBITDEPTH
+  INIT_ALL_SIZES(pred_high[V_PRED], high_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], high_h);
+  INIT_ALL_SIZES(pred_high[D207_PRED], high_d207);
+  INIT_ALL_SIZES(pred_high[D45_PRED], high_d45);
+  INIT_ALL_SIZES(pred_high[D63_PRED], high_d63);
+  INIT_ALL_SIZES(pred_high[D117_PRED], high_d117);
+  INIT_ALL_SIZES(pred_high[D135_PRED], high_d135);
+  INIT_ALL_SIZES(pred_high[D153_PRED], high_d153);
+  INIT_ALL_SIZES(pred_high[TM_PRED], high_tm);
+
+  INIT_ALL_SIZES(dc_pred_high[0][0], high_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], high_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], high_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], high_dc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#undef intra_pred_allsizes
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void build_intra_predictors_high(const MACROBLOCKD *xd,
+                                        const uint8_t *ref8,
+                                        int ref_stride,
+                                        uint8_t *dst8,
+                                        int dst_stride,
+                                        PREDICTION_MODE mode,
+                                        TX_SIZE tx_size,
+                                        int up_available,
+                                        int left_available,
+                                        int right_available,
+                                        int x, int y,
+                                        int plane, int bd) {
+  int i;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16);
+  uint16_t *above_row = above_data + 16;
+  const uint16_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  //  int base=128;
+  int base = 128 << (bd - 8);
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  // left
+  if (left_available) {
+    if (xd->mb_to_bottom_edge < 0) {
+      /* slower path if the block needs border extension */
+      if (y0 + bs <= frame_height) {
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      } else {
+        const int extend_bottom = frame_height - y0;
+        for (i = 0; i < extend_bottom; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+        for (; i < bs; ++i)
+          left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+      }
+    } else {
+      /* faster path if the block does not need extension */
+      for (i = 0; i < bs; ++i)
+        left_col[i] = ref[i * ref_stride - 1];
+    }
+  } else {
+    // TODO(Peter): this value should probably change for high bitdepth
+    vpx_memset16(left_col, base + 1, bs);
+  }
+
+  // TODO(hkuang) do not extend 2*bs pixels for all modes.
+  // above
+  if (up_available) {
+    const uint16_t *above_ref = ref - ref_stride;
+    if (xd->mb_to_right_edge < 0) {
+      /* slower path if the block needs border extension */
+      if (x0 + 2 * bs <= frame_width) {
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t));
+        } else {
+          vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+        }
+      } else if (x0 + bs <= frame_width) {
+        const int r = frame_width - x0;
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
+        } else {
+          vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+        }
+      } else if (x0 <= frame_width) {
+        const int r = frame_width - x0;
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
+        } else {
+          vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
+        }
+      }
+      // TODO(Peter) this value should probably change for high bitdepth
+      above_row[-1] = left_available ? above_ref[-1] : (base+1);
+    } else {
+      /* faster path if the block does not need extension */
+      if (bs == 4 && right_available && left_available) {
+        const_above_row = above_ref;
+      } else {
+        vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+        if (bs == 4 && right_available)
+          vpx_memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t));
+        else
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+        // TODO(Peter): this value should probably change for high bitdepth
+        above_row[-1] = left_available ? above_ref[-1] : (base+1);
+      }
+    }
+  } else {
+    vpx_memset16(above_row, base - 1, bs * 2);
+    // TODO(Peter): this value should probably change for high bitdepth
+    above_row[-1] = base - 1;
+  }
+
+  // predict
+  if (mode == DC_PRED) {
+    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
+                                                        const_above_row,
+                                                        left_col, xd->bd);
+  } else {
+    pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
+                             xd->bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
                                    int ref_stride, uint8_t *dst, int dst_stride,
                                    PREDICTION_MODE mode, TX_SIZE tx_size,
@@ -454,6 +897,14 @@
   const int y = loff * 4;
 
   assert(bwl >= 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
+                                tx_size, have_top, have_left, have_right,
+                                x, y, plane, xd->bd);
+    return;
+  }
+#endif
   build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
                          have_top, have_left, have_right, x, y, plane);
 }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 32bcf9a..b75ea64 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -445,61 +445,219 @@
   specialize qw/vp9_iwht4x4_16_add/;
 }
 
-
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-#
-# dct
-#
-add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct4x4_1_add/;
+  #
+  # Intra prediction
+  #
+  add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d207_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct4x4_16_add/;
+  add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d45_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct8x8_1_add/;
+  add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d63_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct8x8_64_add/;
+  add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_h_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct8x8_10_add/;
+  add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d117_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct16x16_1_add/;
+  add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d135_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct16x16_256_add/;
+  add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d153_predictor_4x4/;
 
-add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct16x16_10_add/;
+  add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc";
 
-add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct32x32_1024_add/;
+  add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc";
 
-add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct32x32_34_add/;
+  add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc";
 
-add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_idct32x32_1_add/;
+  add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_top_predictor_4x4/;
 
-add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
-specialize qw/vp9_high_iht4x4_16_add/;
+  add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_left_predictor_4x4/;
 
-add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
-specialize qw/vp9_high_iht8x8_64_add/;
+  add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_128_predictor_4x4/;
 
-add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
-specialize qw/vp9_high_iht16x16_256_add/;
+  add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d207_predictor_8x8/;
 
-# dct and add
+  add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d45_predictor_8x8/;
 
-add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_iwht4x4_1_add/;
+  add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d63_predictor_8x8/;
 
-add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-specialize qw/vp9_high_iwht4x4_16_add/;
+  add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_h_predictor_8x8/;
+
+  add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d117_predictor_8x8/;
+
+  add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d135_predictor_8x8/;
+
+  add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d153_predictor_8x8/;
+
+  add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";;
+
+  add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_top_predictor_8x8/;
+
+  add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_left_predictor_8x8/;
+
+  add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_128_predictor_8x8/;
+
+  add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d207_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d45_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d63_predictor_16x16/;
+
+  add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_h_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d117_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d135_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d153_predictor_16x16/;
+
+  add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_top_predictor_16x16/;
+
+  add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_left_predictor_16x16/;
+
+  add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_128_predictor_16x16/;
+
+  add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d207_predictor_32x32/;
+
+  add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d45_predictor_32x32/;
+
+  add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d63_predictor_32x32/;
+
+  add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_h_predictor_32x32/;
+
+  add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d117_predictor_32x32/;
+
+  add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d135_predictor_32x32/;
+
+  add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_d153_predictor_32x32/;
+
+  add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_top_predictor_32x32/;
+
+  add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_left_predictor_32x32/;
+
+  add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  specialize qw/vp9_high_dc_128_predictor_32x32/;
+
+  #
+  # dct
+  #
+  add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct4x4_1_add/;
+
+  add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct4x4_16_add/;
+
+  add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct8x8_1_add/;
+
+  add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct8x8_64_add/;
+
+  add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct8x8_10_add/;
+
+  add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct16x16_1_add/;
+
+  add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct16x16_256_add/;
+
+  add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct16x16_10_add/;
+
+  add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct32x32_1024_add/;
+
+  add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct32x32_34_add/;
+
+  add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_idct32x32_1_add/;
+
+  add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_high_iht4x4_16_add/;
+
+  add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_high_iht8x8_64_add/;
+
+  add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  specialize qw/vp9_high_iht16x16_256_add/;
+
+  # dct and add
+
+  add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_high_iwht4x4_16_add/;
 }
 
 #
@@ -838,7 +996,7 @@
   specialize qw/vp9_quantize_b_32x32/;
 } else {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_block_error avx2/;
+  specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
@@ -850,7 +1008,7 @@
   specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/;
+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
 }
 
 #
diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm
new file mode 100644
index 0000000..ff45071
--- /dev/null
+++ b/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -0,0 +1,476 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4
+pw_8:  times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_MMX sse
+cglobal high_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, one
+  mov                 oned, 0x0001
+  pxor                  m1, m1
+  movd                  m3, oned
+  pshufw                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal high_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, one
+  mov                 oned, 0x00010001
+  lea             stride3q, [strideq*3]
+  movd                  m3, oned
+  pshufd                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_8)]
+  psrlw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal high_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m3, [aboveq+16]
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_16)]
+  psrad                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova   [dstq              ], m0
+  mova   [dstq           +16], m0
+  mova   [dstq+strideq*2    ], m0
+  mova   [dstq+strideq*2 +16], m0
+  mova   [dstq+strideq*4    ], m0
+  mova   [dstq+strideq*4 +16], m0
+  mova   [dstq+stride3q*2   ], m0
+  mova   [dstq+stride3q*2+16], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal high_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  mova                  m5, [leftq]
+  mova                  m6, [leftq+16]
+  mova                  m7, [leftq+32]
+  mova                  m8, [leftq+48]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  paddw                 m0, m5
+  paddw                 m0, m6
+  paddw                 m0, m7
+  paddw                 m0, m8
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_32)]
+  psrad                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova [dstq               ], m0
+  mova [dstq          +16  ], m0
+  mova [dstq          +32  ], m0
+  mova [dstq          +48  ], m0
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2+16  ], m0
+  mova [dstq+strideq*2+32  ], m0
+  mova [dstq+strideq*2+48  ], m0
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4+16  ], m0
+  mova [dstq+strideq*4+32  ], m0
+  mova [dstq+strideq*4+48  ], m0
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m0
+  mova [dstq+stride3q*2 +32], m0
+  mova [dstq+stride3q*2 +48], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+%endif
+
+INIT_MMX sse
+cglobal high_v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal high_v_predictor_8x8, 3, 3, 1, dst, stride, above
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal high_v_predictor_16x16, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:
+  mova    [dstq              ], m0
+  mova    [dstq           +16], m1
+  mova    [dstq+strideq*2    ], m0
+  mova    [dstq+strideq*2 +16], m1
+  mova    [dstq+strideq*4    ], m0
+  mova    [dstq+strideq*4 +16], m1
+  mova    [dstq+stride3q*2   ], m0
+  mova    [dstq+stride3q*2+16], m1
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal high_v_predictor_32x32, 3, 4, 4, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  mova                  m2, [aboveq+32]
+  mova                  m3, [aboveq+48]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq            +32], m2
+  mova [dstq            +48], m3
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2  +16], m1
+  mova [dstq+strideq*2  +32], m2
+  mova [dstq+strideq*2  +48], m3
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4  +16], m1
+  mova [dstq+strideq*4  +32], m2
+  mova [dstq+strideq*4  +48], m3
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m1
+  mova [dstq+stride3q*2 +32], m2
+  mova [dstq+stride3q*2 +48], m3
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_MMX sse
+cglobal high_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  movq                  m0, [aboveq]
+  pshufw                m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  movd                  m3, oned
+  movd                  m4, bpsd
+  pshufw                m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -2
+  mova                  m2, m3
+  psllw                 m3, m4
+  add                leftq, 8
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movq                  m1, [leftq+lineq*4]
+  movq                  m2, [leftq+lineq*4+2]
+  pshufw                m1, m1, 0x0
+  pshufw                m2, m2, 0x0
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m1
+  movq    [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal high_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  mova                  m0, [aboveq]
+  pshuflw               m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m3, m3
+  pxor                  m4, m4
+  pinsrw                m3, oned, 0
+  pinsrw                m4, bpsd, 0
+  pshuflw               m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m3, m3
+  mov                lineq, -4
+  mova                  m2, m3
+  punpcklqdq            m1, m1
+  psllw                 m3, m4
+  add                leftq, 16
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movd                  m1, [leftq+lineq*4]
+  movd                  m2, [leftq+lineq*4+2]
+  pshuflw               m1, m1, 0x0
+  pshuflw               m2, m2, 0x0
+  punpcklqdq            m1, m1
+  punpcklqdq            m2, m2
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  mova      [dstq          ], m1
+  mova      [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal high_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+  movd                  m2, [aboveq-2]
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  pshuflw               m2, m2, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m7, m7
+  pxor                  m8, m8
+  pinsrw                m7, oned, 0
+  pinsrw                m8, bpsd, 0
+  pshuflw               m7, m7, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m7, m7
+  mov                lineq, -8
+  mova                  m5, m7
+  punpcklqdq            m2, m2
+  psllw                 m7, m8
+  add                leftq, 32
+  psubw                 m7, m5 ; max possible value
+  pxor                  m8, m8 ; min possible value
+  psubw                 m0, m2
+  psubw                 m1, m2
+.loop:
+  movd                  m2, [leftq+lineq*4]
+  movd                  m3, [leftq+lineq*4+2]
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m4, m2, m0
+  paddw                 m5, m3, m0
+  paddw                 m2, m1
+  paddw                 m3, m1
+  ;Clamp to the bit-depth
+  pminsw                m4, m7
+  pminsw                m5, m7
+  pminsw                m2, m7
+  pminsw                m3, m7
+  pmaxsw                m4, m8
+  pmaxsw                m5, m8
+  pmaxsw                m2, m8
+  pmaxsw                m3, m8
+  ;Store the values
+  mova   [dstq             ], m4
+  mova   [dstq+strideq*2   ], m5
+  mova   [dstq          +16], m2
+  mova   [dstq+strideq*2+16], m3
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal high_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+  movd                  m0, [aboveq-2]
+  mova                  m1, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  pshuflw               m0, m0, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                 m10, m10
+  pxor                 m11, m11
+  pinsrw               m10, oned, 0
+  pinsrw               m11, bpsd, 0
+  pshuflw              m10, m10, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq           m10, m10
+  mov                lineq, -16
+  mova                  m5, m10
+  punpcklqdq            m0, m0
+  psllw                m10, m11
+  add                leftq, 64
+  psubw                m10, m5 ; max possible value
+  pxor                 m11, m11 ; min possible value
+  psubw                 m1, m0
+  psubw                 m2, m0
+  psubw                 m3, m0
+  psubw                 m4, m0
+.loop:
+  movd                  m5, [leftq+lineq*4]
+  movd                  m6, [leftq+lineq*4+2]
+  pshuflw               m5, m5, 0x0
+  pshuflw               m6, m6, 0x0
+  punpcklqdq            m5, m5
+  punpcklqdq            m6, m6
+  paddw                 m7, m5, m1
+  paddw                 m8, m5, m2
+  paddw                 m9, m5, m3
+  paddw                 m5, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m5, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m5, m11
+  ;Store these values
+  mova   [dstq           ], m7
+  mova   [dstq        +16], m8
+  mova   [dstq        +32], m9
+  mova   [dstq        +48], m5
+  paddw                 m7, m6, m1
+  paddw                 m8, m6, m2
+  paddw                 m9, m6, m3
+  paddw                 m6, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m6, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m6, m11
+  ;Store these values
+  mova   [dstq+strideq*2   ], m7
+  mova   [dstq+strideq*2+16], m8
+  mova   [dstq+strideq*2+32], m9
+  mova   [dstq+strideq*2+48], m6
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+%endif
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 7615cdd..f99fa7a 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1366,11 +1366,11 @@
   int q;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q);
-    cm->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant[q][1] = vp9_ac_quant(q, 0, cm->bit_depth);
 
-    cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q);
-    cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q);
+    cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth);
+    cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
   }
 }
 
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 76ca1ae..df46f64 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -13,6 +13,7 @@
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 
 #include "vp9/decoder/vp9_detokenize.h"
 
@@ -31,29 +32,31 @@
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
      if (!cm->frame_parallel_decoding_mode)                 \
-       ++coef_counts[band][ctx][token];                     \
+       ++coef_counts[band][ctx][token];                      \
   } while (0)
 
-#define WRITE_COEF_CONTINUE(val, token)                  \
-  {                                                      \
-    v = (val * dqv) >> dq_shift;                         \
-    dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;         \
-    token_cache[scan[c]] = vp9_pt_energy_class[token];   \
-    ++c;                                                 \
-    ctx = get_coef_context(nb, token_cache, c);          \
-    dqv = dq[1];                                         \
-    continue;                                            \
-  }
+static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
+  int i, val = 0;
+  for (i = 0; i < n; ++i)
+    val = (val << 1) | vp9_read(r, probs[i]);
+  return val;
+}
 
-#define ADJUST_COEF(prob, bits_count)                   \
-  do {                                                  \
-    val += (vp9_read(r, prob) << bits_count);           \
-  } while (0)
+static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = {
+  2, 6,                                         /* 0 = LOW_VAL */
+  -TWO_TOKEN, 4,                                /* 1 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                    /* 2 = THREE */
+  8, 10,                                        /* 3 = HIGH_LOW */
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,           /* 4 = CAT_ONE */
+  12, 14,                                       /* 5 = CAT_THREEFOUR */
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,           /* 6 = CAT_THREE */
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN            /* 7 = CAT_FIVE */
+};
 
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
-                       tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
-                       int ctx, const int16_t *scan, const int16_t *nb,
-                       vp9_reader *r) {
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        vp9_reader *r) {
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
@@ -69,11 +72,11 @@
   uint8_t token_cache[32 * 32];
   const uint8_t *band_translate = get_band_translate(tx_size);
   const int dq_shift = (tx_size == TX_32X32);
-  int v;
+  int v, token;
   int16_t dqv = dq[0];
 
   while (c < max_eob) {
-    int val;
+    int val = -1;
     band = *band_translate++;
     prob = coef_probs[band][ctx];
     if (!cm->frame_parallel_decoding_mode)
@@ -95,81 +98,46 @@
       prob = coef_probs[band][ctx];
     }
 
-    // ONE_CONTEXT_NODE_0_
     if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
-      WRITE_COEF_CONTINUE(1, ONE_TOKEN);
-    }
-
-    INCREMENT_COUNT(TWO_TOKEN);
-
-    prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
-
-    if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(2, TWO_TOKEN);
+      token = ONE_TOKEN;
+      val = 1;
+    } else {
+      INCREMENT_COUNT(TWO_TOKEN);
+      token = vp9_read_tree(r, coeff_subtree_high,
+                            vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
+      switch (token) {
+        case TWO_TOKEN:
+        case THREE_TOKEN:
+        case FOUR_TOKEN:
+          val = token;
+          break;
+        case CATEGORY1_TOKEN:
+          val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r);
+          break;
+        case CATEGORY2_TOKEN:
+          val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r);
+          break;
+        case CATEGORY3_TOKEN:
+          val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r);
+          break;
+        case CATEGORY4_TOKEN:
+          val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r);
+          break;
+        case CATEGORY5_TOKEN:
+          val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r);
+          break;
+        case CATEGORY6_TOKEN:
+          val = CAT6_MIN_VAL + read_coeff(vp9_cat6_prob, 14, r);
+          break;
       }
-      if (!vp9_read(r, prob[THREE_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(3, THREE_TOKEN);
-      }
-      WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
-
-    if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
-        val = CAT1_MIN_VAL;
-        ADJUST_COEF(vp9_cat1_prob[0], 0);
-        WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN);
-      }
-      val = CAT2_MIN_VAL;
-      ADJUST_COEF(vp9_cat2_prob[0], 1);
-      ADJUST_COEF(vp9_cat2_prob[1], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN);
-    }
-
-    if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
-        val = CAT3_MIN_VAL;
-        ADJUST_COEF(vp9_cat3_prob[0], 2);
-        ADJUST_COEF(vp9_cat3_prob[1], 1);
-        ADJUST_COEF(vp9_cat3_prob[2], 0);
-        WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN);
-      }
-      val = CAT4_MIN_VAL;
-      ADJUST_COEF(vp9_cat4_prob[0], 3);
-      ADJUST_COEF(vp9_cat4_prob[1], 2);
-      ADJUST_COEF(vp9_cat4_prob[2], 1);
-      ADJUST_COEF(vp9_cat4_prob[3], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN);
-    }
-
-    if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
-      val = CAT5_MIN_VAL;
-      ADJUST_COEF(vp9_cat5_prob[0], 4);
-      ADJUST_COEF(vp9_cat5_prob[1], 3);
-      ADJUST_COEF(vp9_cat5_prob[2], 2);
-      ADJUST_COEF(vp9_cat5_prob[3], 1);
-      ADJUST_COEF(vp9_cat5_prob[4], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN);
-    }
-    val = 0;
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[0]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[1]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[2]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[3]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[4]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[5]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[6]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[7]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[8]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[9]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[10]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[11]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[12]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[13]);
-    val += CAT6_MIN_VAL;
-
-    WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN);
+    v = (val * dqv) >> dq_shift;
+    dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    ctx = get_coef_context(nb, token_cache, c);
+    dqv = dq[1];
   }
 
   return c;
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 33f9239..f7fca0c 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -23,9 +23,9 @@
 static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
   {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
 
-static int get_aq_c_strength(int q_index) {
+static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
-  int base_quant = vp9_ac_quant(q_index, 0) / 4;
+  const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
   return (base_quant > 20) + (base_quant > 45);
 }
 
@@ -40,7 +40,7 @@
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
-    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
     const int active_segments = aq_c_active_segments[aq_strength];
 
     // Clear down the segment map.
@@ -70,7 +70,8 @@
     for (segment = 1; segment < active_segments; ++segment) {
       int qindex_delta =
           vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     aq_c_q_adj_factor[aq_strength][segment]);
+                                     aq_c_q_adj_factor[aq_strength][segment],
+                                     cm->bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -115,7 +116,7 @@
     // It is converted to bits * 256 units.
     const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
                             (bw * bh);
-    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
     const int active_segments = aq_c_active_segments[aq_strength];
 
     // The number of segments considered and the transition points used to
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index e7f0daa..514ff7a 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -200,7 +200,7 @@
 
     // Rate target ratio to set q delta.
     const float rate_ratio_qdelta = 2.0;
-    const double q = vp9_convert_qindex_to_q(cm->base_qindex);
+    const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
     vp9_clear_system_state();
     // Some of these parameters may be set via codec-control function later.
     cr->max_sbs_perframe = 10;
@@ -242,7 +242,8 @@
     // Set the q delta for segment 1.
     qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
                                               cm->base_qindex,
-                                              rate_ratio_qdelta);
+                                              rate_ratio_qdelta,
+                                              cm->bit_depth);
     // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
     // previous encoded frame.
     if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 56db95e..b96f00f 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -75,7 +75,7 @@
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
   const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
                                               cm->y_dc_delta_q);
   int i;
@@ -99,7 +99,8 @@
         continue;
       }
 
-      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i));
+      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i),
+                                        cm->bit_depth);
       vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
       vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index b3884d0..9545ba0 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -330,7 +330,8 @@
       seg->update_map = 1;
       seg->update_data = 1;
 
-      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875);
+      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->bit_depth);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
 
@@ -351,7 +352,8 @@
         seg->update_data = 1;
         seg->abs_delta = SEGMENT_DELTADATA;
 
-        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125);
+        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+                                      cm->bit_depth);
         vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
@@ -2397,10 +2399,7 @@
 static void Pass2Encode(VP9_COMP *cpi, size_t *size,
                         uint8_t *dest, unsigned int *frame_flags) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-
-  vp9_rc_get_second_pass_params(cpi);
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-
   vp9_twopass_postencode_update(cpi);
 }
 
@@ -2724,6 +2723,12 @@
   cm->frame_bufs[cm->new_fb_idx].ref_count--;
   cm->new_fb_idx = get_free_fb(cm);
 
+  // For two pass encodes analyse the first pass stats and determine
+  // the bit allocation and other parameters for this frame / group of frames.
+  if ((oxcf->pass == 2) && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    vp9_rc_get_second_pass_params(cpi);
+  }
+
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
       init_buffer_indices(cpi);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 54b57cf..df82be5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -62,8 +62,8 @@
   *b = temp;
 }
 
-static int gfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
+static int gfboost_qadjust(int qindex, vpx_bit_depth_t bit_depth) {
+  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
   return (int)((0.00000828 * q * q * q) +
                (-0.0055 * q * q) +
                (1.32 * q) + 79.3);
@@ -360,11 +360,11 @@
   }
 }
 
-static int find_fp_qindex() {
+static int find_fp_qindex(vpx_bit_depth_t bit_depth) {
   int i;
 
   for (i = 0; i < QINDEX_RANGE; ++i)
-    if (vp9_convert_qindex_to_q(i) >= 30.0)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= 30.0)
       break;
 
   if (i == QINDEX_RANGE)
@@ -434,7 +434,7 @@
   vp9_clear_system_state();
 
   set_first_pass_params(cpi);
-  vp9_set_quantizer(cm, find_fp_qindex());
+  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
 
   if (lc != NULL) {
     twopass = &lc->twopass;
@@ -935,12 +935,13 @@
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int q) {
+                                     int q,
+                                     vpx_bit_depth_t bit_depth) {
   const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low,
-                                pt_high);
+  const double power_term =
+      MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.0125 + pt_low, pt_high);
 
   // Calculate correction factor.
   if (power_term < 1.0)
@@ -975,9 +976,11 @@
       const double factor =
           calc_correction_factor(err_per_mb, ERR_DIVISOR,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
-                                 FACTOR_PT_LOW, FACTOR_PT_HIGH, q);
+                                 FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
+                                 cpi->common.bit_depth);
       const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
-                                                 factor * speed_term);
+                                                 factor * speed_term,
+                                                 cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
     }
@@ -1594,7 +1597,8 @@
    // At high Q when there are few bits to spare we are better with a longer
    // interval to spread the cost of the GF.
    active_max_gf_interval =
-     12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
+     12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME],
+                                        cpi->common.bit_depth) >> 5);
 
    if (active_max_gf_interval > rc->max_gf_interval)
      active_max_gf_interval = rc->max_gf_interval;
@@ -1736,7 +1740,8 @@
   // Calculate the extra bits to be used for boosted frame(s)
   {
     int q = rc->last_q[INTER_FRAME];
-    int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
+    int boost =
+        (rc->gfu_boost * gfboost_qadjust(q, cpi->common.bit_depth)) / 100;
 
     // Set max and minimum boost and hence minimum allocation.
     boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
@@ -2227,7 +2232,7 @@
                                                 section_target_bandwidth);
     twopass->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
-    rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 5557d7f..2fc05e7 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -142,7 +142,7 @@
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = get_max_filter_level(cpi);
-    const int q = vp9_ac_quant(cm->base_qindex, 0);
+    const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index a97d778..2edd52b 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -440,7 +440,7 @@
   unsigned int sse_y = UINT_MAX;
 
   const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
+      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int intra_mode_cost = 50;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index d49eb95..2f225d7 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -40,6 +40,31 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+                          const int16_t *round_ptr, const int16_t quant,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  if (!skip_block) {
+    const int rc = 0;
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    const int64_t tmp =
+        (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
+         quant) >> 16;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -62,6 +87,31 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                                const int16_t *round_ptr, const int16_t quant,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  if (!skip_block) {
+    const int rc = 0;
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    const int64_t tmp =
+        (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
+         quant) >> 15;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -103,6 +153,51 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            int zbin_oq_value, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  int i;
+  int eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      const int64_t tmp =
+          (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
+           quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -146,6 +241,51 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, int skip_block,
+                                  const int16_t *zbin_ptr,
+                                  const int16_t *round_ptr,
+                                  const int16_t *quant_ptr,
+                                  const int16_t *quant_shift_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t *dequant_ptr,
+                                  int zbin_oq_value, uint16_t *eob_ptr,
+                                  const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int64_t tmp = 0;
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                    INT32_MIN, INT32_MAX);
+        tmp = (tmp * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -199,6 +339,62 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           int skip_block, const int16_t *zbin_ptr,
+                           const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *quant_shift_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t *dequant_ptr, int zbin_oq_value,
+                           uint16_t *eob_ptr, const int16_t *scan,
+                           const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
+                         zbin_ptr[1] + zbin_oq_value };
+  const int nzbins[2] = { zbins[0] * -1,
+                          zbins[1] * -1 };
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0],
+                            INT32_MIN, INT32_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
                             const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -255,12 +451,84 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                 intptr_t n_coeffs, int skip_block,
+                                 const int16_t *zbin_ptr,
+                                 const int16_t *round_ptr,
+                                 const int16_t *quant_ptr,
+                                 const int16_t *quant_shift_ptr,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr,
+                                 int zbin_oq_value, uint16_t *eob_ptr,
+                                 const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff +
+                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                          INT32_MIN, INT32_MAX);
+      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                        16, x->skip_block,
+                        p->zbin, p->round, p->quant, p->quant_shift,
+                        BLOCK_OFFSET(p->qcoeff, block),
+                        BLOCK_OFFSET(pd->dqcoeff, block),
+                        pd->dequant, p->zbin_extra, &p->eobs[block],
+                        scan, iscan);
+    return;
+  }
+#endif
   vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
            16, x->skip_block,
            p->zbin, p->round, p->quant, p->quant_shift,
@@ -281,9 +549,23 @@
 }
 
 static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
-  int quant = vp9_dc_quant(q, 0);
+  const int quant = vp9_dc_quant(q, 0, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+    case VPX_BITS_10:
+      return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+    case VPX_BITS_12:
+      return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
   (void) bit_depth;
   return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+#endif
 }
 
 void vp9_init_quantizer(VP9_COMP *cpi) {
@@ -301,8 +583,8 @@
         qrounding_factor_fp = 64;
 
       // y
-      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
-                     : vp9_ac_quant(q, 0);
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, 0, cm->bit_depth);
       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
       quants->y_quant_fp[q][i] = (1 << 16) / quant;
       quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
@@ -311,8 +593,8 @@
       cm->y_dequant[q][i] = quant;
 
       // uv
-      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
-                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
       invert_quant(&quants->uv_quant[q][i],
                    &quants->uv_quant_shift[q][i], quant);
       quants->uv_quant_fp[q][i] = (1 << 16) / quant;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b607c85..94c0b64 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -42,13 +42,56 @@
 
 #define FRAME_OVERHEAD_BITS 200
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    switch (bit_depth) { \
+      case VPX_BITS_8: \
+        name = name##_8; \
+        break; \
+      case VPX_BITS_10: \
+        name = name##_10; \
+        break; \
+      case VPX_BITS_12: \
+        name = name##_12; \
+        break; \
+      default: \
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
+                    " or VPX_BITS_12"); \
+        name = NULL; \
+    } \
+  } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    (void) bit_depth; \
+    name = name##_8; \
+  } while (0)
+#endif
+
 // Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int arfgf_low_motion_minq[QINDEX_RANGE];
-static int arfgf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-static int rtc_minq[QINDEX_RANGE];
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
 static int gf_high = 2000;
 static int gf_low = 400;
 static int kf_high = 5000;
@@ -58,7 +101,8 @@
 // formulaic approach to facilitate easier adjustment of the Q tables.
 // The formulae were derived from computing a 3rd order polynomial best
 // fit to the original data (after plotting real maxq vs minq (not q index))
-static int get_minq_index(double maxq, double x3, double x2, double x1) {
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          vpx_bit_depth_t bit_depth) {
   int i;
   const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
                                 maxq);
@@ -68,38 +112,69 @@
   if (minqtarget <= 2.0)
     return 0;
 
-  for (i = 0; i < QINDEX_RANGE; i++)
-    if (minqtarget <= vp9_convert_qindex_to_q(i))
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (minqtarget <= vp9_convert_qindex_to_q(i, bit_depth))
       return i;
+  }
 
   return QINDEX_RANGE - 1;
 }
 
-void vp9_rc_init_minq_luts() {
+static void init_minq_luts(int *kf_low_m, int *kf_high_m,
+                           int *arfgf_low, int *arfgf_high,
+                           int *inter, int *rtc, vpx_bit_depth_t bit_depth) {
   int i;
-
   for (i = 0; i < QINDEX_RANGE; i++) {
-    const double maxq = vp9_convert_qindex_to_q(i);
-    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125);
-    kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50);
-    arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30);
-    arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
-    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90);
-    rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70);
+    const double maxq = vp9_convert_qindex_to_q(i, bit_depth);
+    kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125, bit_depth);
+    kf_high_m[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50, bit_depth);
+    arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50, bit_depth);
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
   }
 }
 
+void vp9_rc_init_minq_luts() {
+  init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+                 arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+                 inter_minq_8, rtc_minq_8, VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+                 arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+                 inter_minq_10, rtc_minq_10, VPX_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+                 arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+                 inter_minq_12, rtc_minq_12, VPX_BITS_12);
+#endif
+}
+
 // These functions use formulaic calculations to make playing with the
 // quantizer tables easier. If necessary they can be replaced by lookup
 // tables if and when things settle down in the experimental bitstream
-double vp9_convert_qindex_to_q(int qindex) {
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
   // Convert the index to a real Q value (scaled down to match old Q values)
-  return vp9_ac_quant(qindex, 0) / 4.0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+    case VPX_BITS_10:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 16.0;
+    case VPX_BITS_12:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1.0;
+  }
+#else
+  return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
 }
 
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor) {
-  const double q = vp9_convert_qindex_to_q(qindex);
+                       double correction_factor,
+                       vpx_bit_depth_t bit_depth) {
+  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
   int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
 
   // q based adjustment to baseline enumerator
@@ -108,8 +183,10 @@
 }
 
 static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
-                              double correction_factor) {
-  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor));
+                              double correction_factor,
+                              vpx_bit_depth_t bit_depth) {
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
+                                           bit_depth));
   return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;
 }
 
@@ -227,7 +304,7 @@
   rc->ni_frames = 0;
 
   rc->tot_q = 0.0;
-  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);
+  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
 
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
     rc->rate_correction_factors[i] = 1.0;
@@ -330,7 +407,8 @@
   // Stay in double to avoid int overflow when values are large
   projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
                                                  cm->base_qindex, cm->MBs,
-                                                 rate_correction_factor);
+                                                 rate_correction_factor,
+                                                 cm->bit_depth);
   // Work out a size correction factor.
   if (projected_size_based_on_q > 0)
     correction_factor = (100 * cpi->rc.projected_frame_size) /
@@ -392,7 +470,8 @@
 
   do {
     const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
-                                                             correction_factor);
+                                                              correction_factor,
+                                                              cm->bit_depth);
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -424,12 +503,22 @@
   }
 }
 
-static int get_kf_active_quality(const RATE_CONTROL *const rc, int q) {
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *kf_low_motion_minq;
+  int *kf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
   return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q) {
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *arfgf_low_motion_minq;
+  int *arfgf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
   return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
@@ -516,6 +605,8 @@
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
   int q;
+  int *rtc_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
 
   if (frame_is_intra_only(cm)) {
     active_best_quality = rc->best_quality;
@@ -524,9 +615,10 @@
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            (last_boosted_q * 0.75));
+                                            (last_boosted_q * 0.75),
+                                            cm->bit_depth);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
       // not first frame of one pass and kf_boost is set
@@ -534,7 +626,8 @@
       double q_val;
 
       active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]);
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -543,9 +636,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              !cpi->use_svc &&
@@ -559,7 +653,7 @@
     } else {
       q = active_worst_quality;
     }
-    active_best_quality = get_gf_active_quality(rc, q);
+    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -592,7 +686,8 @@
     int qdelta = 0;
     vp9_clear_system_state();
     qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                        active_worst_quality, 2.0);
+                                        active_worst_quality, 2.0,
+                                        cm->bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
@@ -644,6 +739,8 @@
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
 
@@ -652,9 +749,10 @@
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            last_boosted_q * 0.75);
+                                            last_boosted_q * 0.75,
+                                            cm->bit_depth);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // not first frame of one pass and kf_boost is set
@@ -662,7 +760,8 @@
       double q_val;
 
       active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]);
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -671,9 +770,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -691,7 +791,7 @@
       if (q < cq_level)
         q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
@@ -700,10 +800,10 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
       }
     } else {
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
@@ -742,11 +842,13 @@
         !rc->this_key_frame_forced &&
         !(cm->current_video_frame == 0)) {
       qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 2.0);
+                                          active_worst_quality, 2.0,
+                                          cm->bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
                (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 1.75);
+                                          active_worst_quality, 1.75,
+                                          cm->bit_depth);
     }
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
@@ -788,6 +890,8 @@
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
     // Handle the special case for key frames forced when we have75 reached
@@ -795,16 +899,18 @@
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            last_boosted_q * 0.75);
+                                            last_boosted_q * 0.75,
+                                            cm->bit_depth);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Not forced keyframe.
       double q_adj_factor = 1.0;
       double q_val;
       // Baseline value derived from cpi->active_worst_quality and kf boost.
-      active_best_quality = get_kf_active_quality(rc, active_worst_quality);
+      active_best_quality = get_kf_active_quality(rc, active_worst_quality,
+                                                  cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -816,9 +922,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -836,7 +943,7 @@
       if (q < cq_level)
         q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
@@ -845,10 +952,10 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
       }
     } else {
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
@@ -888,7 +995,8 @@
     const double rate_factor =
       rate_factor_deltas[gf_group->rf_level[gf_group->index]];
     int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                            active_worst_quality, rate_factor);
+                                            active_worst_quality, rate_factor,
+                                            cm->bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
@@ -1038,7 +1146,7 @@
       rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
       rc->ni_frames++;
-      rc->tot_q += vp9_convert_qindex_to_q(qindex);
+      rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       rc->avg_q = rc->tot_q / rc->ni_frames;
       // Calculate the average Q for normal inter frames (not key or GFU
       // frames).
@@ -1294,7 +1402,8 @@
   rc->baseline_gf_interval = INT_MAX;
 }
 
-int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) {
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth) {
   int start_index = rc->worst_quality;
   int target_index = rc->worst_quality;
   int i;
@@ -1302,14 +1411,14 @@
   // Convert the average q value to an index.
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= qstart)
       break;
   }
 
   // Convert the q target to an index
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= qtarget)
       break;
   }
 
@@ -1317,12 +1426,14 @@
 }
 
 int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
-                               int qindex, double rate_target_ratio) {
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth) {
   int target_index = rc->worst_quality;
   int i;
 
   // Look up the current projected bits per block for the base index
-  const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0);
+  const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0,
+                                                  bit_depth);
 
   // Find the target bits per mb based on the base value and given ratio.
   const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
@@ -1330,7 +1441,7 @@
   // Convert the q target to an index
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     target_index = i;
-    if (vp9_rc_bits_per_mb(frame_type, i, 1.0) <= target_bits_per_mb )
+    if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= target_bits_per_mb)
       break;
   }
 
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 456daf4..2ced8e6 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -12,6 +12,7 @@
 #ifndef VP9_ENCODER_VP9_RATECTRL_H_
 #define VP9_ENCODER_VP9_RATECTRL_H_
 
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
 #include "vp9/common/vp9_blockd.h"
@@ -104,7 +105,7 @@
 void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
                  RATE_CONTROL *rc);
 
-double vp9_convert_qindex_to_q(int qindex);
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
 
 void vp9_rc_init_minq_luts();
 
@@ -167,7 +168,7 @@
 
 // Estimates bits per mb for a given qindex and correction factor.
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor);
+                       double correction_factor, vpx_bit_depth_t bit_depth);
 
 // Clamping utilities for bitrate targets for iframes and pframes.
 int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi,
@@ -180,12 +181,14 @@
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a target q value
-int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget);
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth);
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a value that should equate to the given rate ratio.
 int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
-                               int qindex, double rate_target_ratio);
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth);
 
 void vp9_rc_update_framerate(struct VP9_COMP *cpi);
 
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index b45e07d..8b7066b 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -93,34 +93,69 @@
 }
 
 // Values are now correlated to quantizer.
-static int sad_per_bit16lut[QINDEX_RANGE];
-static int sad_per_bit4lut[QINDEX_RANGE];
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
 
-void vp9_init_me_luts() {
+#if CONFIG_VP9_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+                            vpx_bit_depth_t bit_depth) {
   int i;
-
   // Initialize the sad lut tables using a formulaic calculation for now.
   // This is to make it easier to resolve the impact of experimental changes
   // to the quantizer tables.
-  for (i = 0; i < QINDEX_RANGE; ++i) {
-    const double q = vp9_convert_qindex_to_q(i);
-    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
-    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
+  for (i = 0; i < range; i++) {
+    const double q = vp9_convert_qindex_to_q(i, bit_depth);
+    bit16lut[i] = (int)(0.0418 * q + 2.4107);
+    bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
+void vp9_init_me_luts() {
+  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+                  VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+                  VPX_BITS_10);
+  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+                  VPX_BITS_12);
+#endif
+}
+
 static const int rd_boost_factor[16] = {
   64, 32, 32, 32, 24, 16, 12, 12,
   8, 8, 4, 4, 2, 2, 1, 0
 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-128, 144, 128, 128, 144
+  128, 144, 128, 128, 144
 };
 
 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
-  const int q = vp9_dc_quant(qindex, 0);
+  const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  int rdmult = 0;
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      rdmult = 88 * q * q / 24;
+      break;
+    case VPX_BITS_10:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4);
+      break;
+    case VPX_BITS_12:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8);
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
   int rdmult = 88 * q * q / 24;
-
+#endif
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
@@ -132,15 +167,53 @@
   return rdmult;
 }
 
-static int compute_rd_thresh_factor(int qindex) {
+static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
+  double q;
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+      break;
+    case VPX_BITS_10:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0;
+      break;
+    case VPX_BITS_12:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+#endif
   // TODO(debargha): Adjust the function below.
-  const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
-  return MAX(q, 8);
+  return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
-  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
-  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
+      cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+      break;
+    case VPX_BITS_10:
+      cpi->mb.sadperbit16 = sad_per_bit16lut_10[qindex];
+      cpi->mb.sadperbit4 = sad_per_bit4lut_10[qindex];
+      break;
+    case VPX_BITS_12:
+      cpi->mb.sadperbit16 = sad_per_bit16lut_12[qindex];
+      cpi->mb.sadperbit4 = sad_per_bit4lut_12[qindex];
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  }
+#else
+  cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
+  cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif
 }
 
 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
@@ -149,9 +222,8 @@
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
     const int qindex =
         clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
-                  cm->y_dc_delta_q,
-              0, MAXQ);
-    const int q = compute_rd_thresh_factor(qindex);
+              cm->y_dc_delta_q, 0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
       // Threshold here seems unnecessarily harsh but fine given actual
@@ -457,9 +529,15 @@
   for (i = 0; i < MAX_MODES; ++i)
     rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
-  rd->thresh_mult[THR_NEARESTMV] = 0;
-  rd->thresh_mult[THR_NEARESTG] = 0;
-  rd->thresh_mult[THR_NEARESTA] = 0;
+  if (sf->adaptive_rd_thresh) {
+    rd->thresh_mult[THR_NEARESTMV] = 300;
+    rd->thresh_mult[THR_NEARESTG] = 300;
+    rd->thresh_mult[THR_NEARESTA] = 300;
+  } else {
+    rd->thresh_mult[THR_NEARESTMV] = 0;
+    rd->thresh_mult[THR_NEARESTG] = 0;
+    rd->thresh_mult[THR_NEARESTA] = 0;
+  }
 
   rd->thresh_mult[THR_DC] += 1000;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 7be557d..bf27ba6 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2582,7 +2582,8 @@
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
+  const int intra_cost_penalty =
+      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
@@ -3312,7 +3313,8 @@
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
+  const int intra_cost_penalty =
+      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 18a6a91..ff02666 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -200,8 +200,8 @@
   int frame;
   int mb_col, mb_row;
   unsigned int filter_weight;
-  int mb_cols = cpi->common.mb_cols;
-  int mb_rows = cpi->common.mb_rows;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
   int mb_y_offset = 0;
   int mb_uv_offset = 0;
   DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
@@ -233,7 +233,7 @@
     // To keep the mv in play for both Y and UV planes the max that it
     //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
     cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+    cpi->mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
                          + (17 - 2 * VP9_INTERP_EXTEND);
 
     for (mb_col = 0; mb_col < mb_cols; mb_col++) {
@@ -244,7 +244,7 @@
       vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
 
       cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+      cpi->mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
                            + (17 - 2 * VP9_INTERP_EXTEND);
 
       for (frame = 0; frame < frame_count; frame++) {
@@ -389,10 +389,10 @@
   // Adjust the strength based on active max q.
   if (cpi->common.current_video_frame > 1)
     q = ((int)vp9_convert_qindex_to_q(
-        cpi->rc.avg_frame_qindex[INTER_FRAME]));
+        cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth));
   else
     q = ((int)vp9_convert_qindex_to_q(
-        cpi->rc.avg_frame_qindex[KEY_FRAME]));
+        cpi->rc.avg_frame_qindex[KEY_FRAME], cpi->common.bit_depth));
   if (q > 16) {
     strength = oxcf->arnr_strength;
   } else {
@@ -480,10 +480,12 @@
       }
     }
   } else {
+    // ARF is produced at the native frame size and resized when coded.
     vp9_setup_scale_factors_for_frame(&sf,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      cm->width, cm->height);
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height,
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height);
   }
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 90f0342..e88060c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -89,6 +89,10 @@
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 endif
 
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm
+endif
+
 # common (c)
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h
index e69df4b..41038b1 100644
--- a/vpx/vpx_frame_buffer.h
+++ b/vpx/vpx_frame_buffer.h
@@ -43,15 +43,15 @@
  *
  * This callback is invoked by the decoder to retrieve data for the frame
  * buffer in order for the decode call to complete. The callback must
- * allocate at least min_size in bytes and assign it to fb->data. Then the
- * callback must set fb->size to the allocated size. The application does not
- * need to align the allocated data. The callback is triggered when the
- * decoder needs a frame buffer to decode a compressed image into. This
- * function may be called more than once for every call to vpx_codec_decode.
- * The application may set fb->priv to some data which will be passed
- * back in the ximage and the release function call. |fb| is guaranteed to
- * not be NULL. On success the callback must return 0. Any failure the
- * callback must return a value less than 0.
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to vpx_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the ximage and the release function
+ * call. |fb| is guaranteed to not be NULL. On success the callback must
+ * return 0. Any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] new_size     Size in bytes needed by the buffer
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 70d7ac0..475d231 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -199,11 +199,6 @@
       if (fb->data == NULL || fb->size < external_frame_size)
         return -1;
 
-      // This memset is needed for fixing valgrind error from C loop filter
-      // due to access uninitialized memory in frame border. It could be
-      // removed if border is totally removed.
-      vpx_memset(fb->data, 0, fb->size);
-
       ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
     } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
diff --git a/vpxdec.c b/vpxdec.c
index 091522f..cf23c29 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -384,7 +384,7 @@
 
   if (ext_fb_list->ext_fb[i].size < min_size) {
     free(ext_fb_list->ext_fb[i].data);
-    ext_fb_list->ext_fb[i].data = (uint8_t *)malloc(min_size);
+    ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t));
     if (!ext_fb_list->ext_fb[i].data)
       return -1;