Merge "Fix inconsistency in gm parameter write to bitstream" into nextgenv2
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 0d0dc94..2adbef1 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -376,4 +376,6 @@
 DSP_SRCS-yes += aom_dsp_rtcd.c
 DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
 
+DSP_SRCS-yes += aom_simd.c
+
 $(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a2b9a75..ba4b40f 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -587,11 +587,19 @@
 specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
 
 if (aom_config("CONFIG_CLPF") eq "yes") {
-  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+    add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
+    specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
+    add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
+    specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
+  }
+  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
   specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
-  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
+  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
   specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
-  add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum";
+  add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
   specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
 }
 
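For readers unfamiliar with RTCD: each add_proto/specialize pair above is expanded by the rtcd_h_template rule (see aom_dsp.mk) into the generated aom_dsp_rtcd.h, so changing a prototype here changes every specialised implementation at once. A rough sketch of what the generator emits for aom_clpf_block (abbreviated; the exact output depends on the configured targets):

    void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
                          int dstride, int x0, int y0, int sizex, int sizey,
                          int width, int height, unsigned int strength);
    /* ...one matching declaration per specialize target:
       aom_clpf_block_sse2, _ssse3, _sse4_1, _neon... */
    RTCD_EXTERN void (*aom_clpf_block)(const uint8_t *src, uint8_t *dst,
                                       int sstride, int dstride, int x0,
                                       int y0, int sizex, int sizey, int width,
                                       int height, unsigned int strength);
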
diff --git a/aom_dsp/aom_simd.c b/aom_dsp/aom_simd.c
new file mode 100644
index 0000000..03f4ba9
--- /dev/null
+++ b/aom_dsp/aom_simd.c
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Set to 1 to add some sanity checks in the fallback C code
+const int simd_check = 1;
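simd_check is read by the plain C versions of the SIMD intrinsics so they can trap argument values that would be undefined for the real SIMD instruction (the fprintf/abort pattern visible in c_v128_shuffle_8 in the v128_intrinsics_c.h hunk below). A minimal sketch of the idea, using a hypothetical byte-shift fallback:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    extern const int simd_check;  /* defined in aom_dsp/aom_simd.c above */

    /* Hypothetical C fallback: shift each byte of a 64-bit vector left by n.
       Shifts of more than 7 are undefined for 8-bit lanes, so the C code
       can catch them when simd_check is set. */
    static uint64_t example_v64_shl_8(uint64_t a, unsigned int n) {
      if (simd_check && n > 7) {
        fprintf(stderr, "Error: undefined 8-bit shift by %u\n", n);
        abort();
      }
      /* Per-byte mask (0xff << n) replicated into all eight lanes. */
      return (a << n) & (0x0101010101010101ULL * ((0xffu << n) & 0xff));
    }
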
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index 611949a..d062e07 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -90,8 +90,8 @@
   return literal;
 }
 
-static INLINE int aom_read_tree(aom_reader *r, const aom_tree_index *tree,
-                                const aom_prob *probs) {
+static INLINE int aom_read_tree_bits(aom_reader *r, const aom_tree_index *tree,
+                                     const aom_prob *probs) {
   aom_tree_index i = 0;
 
   while ((i = tree[i + aom_read(r, probs[i >> 1])]) > 0) continue;
@@ -99,6 +99,11 @@
   return -i;
 }
 
+static INLINE int aom_read_tree(aom_reader *r, const aom_tree_index *tree,
+                                const aom_prob *probs) {
+  return aom_read_tree_bits(r, tree, probs);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index d3e4ae9..5e34fd6 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -70,6 +70,22 @@
   for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
 }
 
+static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
+                                       const aom_prob *probs, int bits, int len,
+                                       aom_tree_index i) {
+  do {
+    const int bit = (bits >> --len) & 1;
+    aom_write(w, bit, probs[i >> 1]);
+    i = tr[i + bit];
+  } while (len);
+}
+
+static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
+                                  const aom_prob *probs, int bits, int len,
+                                  aom_tree_index i) {
+  aom_write_tree_bits(w, tree, probs, bits, len, i);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
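The writer walks the token tree from the root, emitting one arithmetic-coded bit per level, and mirrors aom_read_tree_bits() in bitreader.h: positive tree entries index the next node pair, non-positive entries are negated symbol values. A worked example with a hypothetical three-symbol tree (my_tree, my_probs and write_symbol_2 are illustrative, not part of the patch):

    #include "aom_dsp/bitwriter.h"

    /* Symbol 0 codes as "0", symbol 1 as "10", symbol 2 as "11". */
    static const aom_tree_index my_tree[4] = {
      -0, 2,  /* root: bit 0 -> symbol 0, bit 1 -> node pair at index 2 */
      -1, -2  /* second node: bit 0 -> symbol 1, bit 1 -> symbol 2 */
    };
    static const aom_prob my_probs[2] = { 192, 128 }; /* one per node pair */

    static void write_symbol_2(aom_writer *w) {
      /* Symbol 2's path is "11": bits = 3, len = 2, start at the root.
         The loop emits (3 >> 1) & 1 = 1 with probs[0], steps to
         my_tree[1] = 2, then emits (3 >> 0) & 1 = 1 with probs[1]. */
      aom_write_tree(w, my_tree, my_probs, 3, 2, 0);
    }

Reading the same two bits back with aom_read_tree() terminates on the non-positive entry my_tree[3] = -2 and returns 2.
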
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 13d1314..73549b8 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -28,7 +28,7 @@
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
 
 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64(b, a);
+  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
 }
 
 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -52,7 +52,9 @@
 }
 
 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpretq_s64_s8(
                  vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
            : b;
@@ -122,7 +124,7 @@
 
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = 0;
+  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
   return s;
 }
 
@@ -430,11 +432,11 @@
 
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
   return v128_from_64(
-      vreinterpret_s64_u8(
+      (uint64_t)vreinterpret_s64_u8(
           vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                     vget_high_u8(vreinterpretq_u8_s64(x)) } },
                    vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      vreinterpret_s64_u8(
+      (uint64_t)vreinterpret_s64_u8(
           vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                     vget_high_u8(vreinterpretq_u8_s64(x)) } },
                    vreinterpret_u8_s64(vget_low_s64(pattern)))));
@@ -521,21 +523,24 @@
                         vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }
 
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
   return n < 8
              ? v128_from_64(
-                   vorr_u64(vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                       n * 8),
-                            vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                       (8 - n) * 8)),
-                   vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8))
-             : (n == 8 ? v128_from_64(vreinterpret_u64_s64(vget_low_s64(a)), 0)
-                       : v128_from_64(
-                             vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                        (n - 8) * 8),
-                             0));
+                   (uint64_t)vorr_u64(
+                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                  n * 8),
+                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                  (8 - n) * 8)),
+                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                        n * 8))
+             : (n == 8 ? v128_from_64(
+                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+                       : v128_from_64((uint64_t)vshl_n_u64(
+                                          vreinterpret_u64_s64(vget_low_s64(a)),
+                                          (n - 8) * 8),
+                                      0));
 }
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index 561ac86..34e312e 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -15,6 +15,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "./v64_intrinsics_c.h"
+#include "./aom_config.h"
 
 typedef union {
   uint8_t u8[16];
@@ -406,11 +407,13 @@
 }
 
 SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_8(a, b, 1) : _c_v128_unzip_8(a, b, 0);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
+                           : _c_v128_unzip_8(a, b, 0);
 }
 
 SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_8(b, a, 0) : _c_v128_unzip_8(b, a, 1);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
+                           : _c_v128_unzip_8(b, a, 1);
 }
 
 SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
@@ -438,11 +441,13 @@
 }
 
 SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_16(a, b, 1) : _c_v128_unzip_16(a, b, 0);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
+                           : _c_v128_unzip_16(a, b, 0);
 }
 
 SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_16(b, a, 0) : _c_v128_unzip_16(b, a, 1);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
+                           : _c_v128_unzip_16(b, a, 1);
 }
 
 SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
@@ -462,11 +467,13 @@
 }
 
 SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_32(a, b, 1) : _c_v128_unzip_32(a, b, 0);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
+                           : _c_v128_unzip_32(a, b, 0);
 }
 
 SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
-  return big_endian() ? _c_v128_unzip_32(b, a, 0) : _c_v128_unzip_32(b, a, 1);
+  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
+                           : _c_v128_unzip_32(b, a, 1);
 }
 
 SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
@@ -535,8 +542,8 @@
               c);
       abort();
     }
-    t.u8[c] =
-        a.u8[big_endian() ? 15 - (pattern.u8[c] & 15) : pattern.u8[c] & 15];
+    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
+                                     : pattern.u8[c] & 15];
   }
   return t;
 }
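These hunks replace a runtime endianness test with the compile-time CONFIG_BIG_ENDIAN constant from aom_config.h, letting the compiler fold each ternary down to a single call. The retired big_endian() helper was presumably a probe along these lines (sketch, not the actual implementation):

    #include <stdint.h>

    /* 0x0100 stores its high byte first on a big-endian host, so the
       first byte in memory is 1 there and 0 on a little-endian host. */
    static int big_endian_probe(void) {
      const uint16_t t = 0x0100;
      return *(const uint8_t *)&t;
    }
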
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index e09cbb9..4504996 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -58,7 +58,9 @@
   _mm_storeu_si128((__m128i *)p, a);
 }
 
-#if defined(__OPTIMIZE__)
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
 #if defined(__SSSE3__)
 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
@@ -418,26 +420,19 @@
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c);
-  return _mm_packus_epi16(
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x), 8),
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x), 8));
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_srl_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
   __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_sra_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
@@ -468,20 +463,13 @@
    to enforce that. */
 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
-#define v128_shl_n_8(a, c)                                                  \
-  _mm_packus_epi16(                                                         \
-      _mm_srli_epi16(                                                       \
-          _mm_slli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_srli_epi16(                                                       \
-          _mm_slli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c), 8))
-#define v128_shr_n_u8(a, c)                                             \
-  _mm_packus_epi16(                                                     \
-      _mm_srli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
-#define v128_shr_n_s8(a, c)                                             \
-  _mm_packs_epi16(                                                      \
-      _mm_srai_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srai_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
+#define v128_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c)                                         \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
 #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
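The rewritten v128_shl_8/v128_shr_u8 exploit the fact that SSE2 has 16-bit but no 8-bit lane shifts: shift the whole 16-bit lane, then mask off the bits that crossed a byte boundary. A scalar model of a single 16-bit lane shows why the per-byte mask 0xff << c (replicated by _mm_set1_epi8) recovers the true byte shift:

    #include <stdint.h>

    /* Model of the new v128_shl_8() on one 16-bit lane holding two bytes. */
    static uint16_t model_shl_8_lane(uint16_t lane, unsigned int c) {
      uint16_t shifted = (uint16_t)(lane << c); /* _mm_sll_epi16: the low
                                                   byte's top bits bleed into
                                                   the high byte */
      uint16_t mask = (uint16_t)(0x0101u * ((0xffu << c) & 0xff));
      return shifted & mask;                    /* _mm_and_si128 */
    }

    /* Reference: shift the two bytes independently. */
    static uint16_t ref_shl_8_lane(uint16_t lane, unsigned int c) {
      uint8_t lo = (uint8_t)((lane & 0xff) << c);
      uint8_t hi = (uint8_t)((lane >> 8) << c);
      return (uint16_t)((hi << 8) | lo);
    }

The two agree for every lane and every c in 0..7; v128_shr_u8 is the mirror image with mask 0xff >> c, and v128_shr_s8 instead duplicates each byte with _mm_unpacklo_epi8(a, a) so a 16-bit arithmetic shift by c + 8 sign-extends correctly.
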
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index b487303..bf92167 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -14,12 +14,10 @@
 
 #include <arm_neon.h>
 #include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
 
-/* vzip in gcc is broken.  Fixed in 4.6.1? */
-#if __GNUC__ &&                                                       \
-    ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
-     (4 << 16) + (6 << 8) + 1)
-#error vzip buggy in gcc.  Get at least gcc 4.6.1.
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
 #endif
 
 typedef int64x1_t v64;
@@ -51,7 +49,7 @@
 
 SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
 
-SIMD_INLINE uint64_t v64_u64(v64 x) { return x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
 
 SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
   return *((uint32_t *)p);
@@ -66,12 +64,16 @@
 }
 
 SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if __CC_ARM
+#if __clang__
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#elif __CC_ARM
  *((__packed uint32_t *)p) = a;
 #elif __GNUC__
   *((__attribute((packed)) uint32_t *)p) = a;
 #else
-  vst1_lane_u32((uint32_t*)p, vreinterpret_u32_s64(a), 0);
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
 #endif
 }
 
@@ -91,13 +93,16 @@
   vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
 }
 
+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others won't.
 SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpret_s64_s8(
                  vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
            : b;
 #else
-  return c ? v64_from_64(b >> c * 8) | (a << (8 - c) * 8) : b;
+  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+           : b;
 #endif
 }
 
@@ -121,21 +126,21 @@
   int64x2_t r = vpaddlq_s32(vpaddlq_s16(
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                 vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
-  return vadd_s64(vget_high_s64(r), vget_low_s64(r));
+  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_high_s64(r) + vget_low_s64(r);
+  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-  return vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
 }
 
 typedef uint16x8_t sad64_internal;
@@ -151,12 +156,14 @@
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)(vget_high_u64(r) + vget_low_u64(r));
+  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
 }
 
 typedef int64x1_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return 0; }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+  return (ssd64_internal)(uint64_t)0;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
  * v64_ssd_u8_sum(). */
@@ -166,7 +173,9 @@
   return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
 }
 
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return (uint32_t)s; }
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+  return (uint32_t)(uint64_t)s;
+}
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
 
@@ -470,7 +479,9 @@
       vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
 }
 
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
 
 SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
   return vshl_n_s64(a, c * 8);
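The old inline gcc version check moves into aom_ports/arm.h as AOM_INCOMPATIBLE_GCC, centralising the test for all the ARM files. Judging from the removed lines, the macro is presumably defined along these lines (sketch; the real header may cover more cases):

    /* vzip is miscompiled by gcc before 4.6.1, so flag those versions. */
    #if defined(__GNUC__) && !defined(__clang__) &&                       \
        ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
         (4 << 16) + (6 << 8) + 1)
    #define AOM_INCOMPATIBLE_GCC
    #endif
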
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 502df23..b951492 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -86,9 +86,10 @@
   _mm_storel_epi64((__m128i *)p, a);
 }
 
+// The following function requires an immediate.
 #if __OPTIMIZE__
 #define v64_align(a, b, c) \
-  (c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b;
+  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
 #else
 #define v64_align(a, b, c)                                                  \
   ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
@@ -388,25 +389,18 @@
 SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_packus_epi16(
-      _mm_srli_epi16(_mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a),
-                                   _mm_cvtsi32_si128(c)),
-                     8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
   return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
 }
 
 SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
@@ -437,19 +431,12 @@
    to enforce that. */
 #define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c)                                                  \
-  _mm_packus_epi16(                                                        \
-      _mm_srli_epi16(                                                      \
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_u8(a, c)                                               \
-  _mm_packus_epi16(                                                      \
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_s8(a, c)                                               \
-  _mm_packs_epi16(                                                       \
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
+#define v64_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
 #define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
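The new v64_shr_s8 leans on _mm_unpacklo_epi8(a, a) putting each byte into the high half of a 16-bit lane (the duplicate in the low half is discarded by the shift), so an arithmetic 16-bit shift by c + 8 equals a sign-extended 8-bit shift by c. Scalar model of one byte, assuming arithmetic >> on signed values as the SIMD code itself does:

    #include <stdint.h>

    static int8_t model_shr_s8(int8_t b, unsigned int c) {
      /* _mm_unpacklo_epi8(a, a): the byte lands in both lane halves. */
      int16_t lane = (int16_t)(((uint8_t)b << 8) | (uint8_t)b);
      /* _mm_srai_epi16 by c + 8: the first 8 shifts drop the duplicate,
         the remaining c are the byte shift; _mm_packs_epi16 then narrows
         without saturating because the result already fits in int8. */
      return (int8_t)(lane >> (c + 8));
    }
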
diff --git a/aom_util/aom_util.mk b/aom_util/aom_util.mk
index fcf0d70..14b484a 100644
--- a/aom_util/aom_util.mk
+++ b/aom_util/aom_util.mk
@@ -9,6 +9,7 @@
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 
+
 UTIL_SRCS-yes += aom_util.mk
 UTIL_SRCS-yes += aom_thread.c
 UTIL_SRCS-yes += aom_thread.h
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c
index 0385df7..52389d0 100644
--- a/aom_util/debug_util.c
+++ b/aom_util/debug_util.c
@@ -1,17 +1,17 @@
 /*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <assert.h>
 #include <stdio.h>
 #include "aom_util/debug_util.h"
-
 #define QUEUE_MAX_SIZE 2000000
 static int result_queue[QUEUE_MAX_SIZE];
 static int prob_queue[QUEUE_MAX_SIZE];
diff --git a/aom_util/debug_util.h b/aom_util/debug_util.h
index 7c2299a..c52e385 100644
--- a/aom_util/debug_util.h
+++ b/aom_util/debug_util.h
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_UTIL_DEBUG_UTIL_H_
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 668c75f..1cf5272 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -8,8 +8,10 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+
 #include "av1/common/clpf.h"
 #include "./aom_dsp_rtcd.h"
+#include "aom/aom_image.h"
 #include "aom_dsp/aom_dsp_common.h"
 
 int av1_clpf_maxbits(const AV1_COMMON *cm) {
@@ -27,58 +29,113 @@
   return (8 + delta - (delta < 0)) >> 4;
 }
 
-void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                      int y0, int sizex, int sizey, int width, int height,
-                      unsigned int strength) {
+void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
+                      int dstride, int x0, int y0, int sizex, int sizey,
+                      int width, int height, unsigned int strength) {
   int x, y;
   for (y = y0; y < y0 + sizey; y++) {
     for (x = x0; x < x0 + sizex; x++) {
-      int X = src[y * stride + x];
-      int A = src[AOMMAX(0, y - 1) * stride + x];
-      int B = src[y * stride + AOMMAX(0, x - 2)];
-      int C = src[y * stride + AOMMAX(0, x - 1)];
-      int D = src[y * stride + AOMMIN(width - 1, x + 1)];
-      int E = src[y * stride + AOMMIN(width - 1, x + 2)];
-      int F = src[AOMMIN(height - 1, y + 1) * stride + x];
+      int X = src[y * sstride + x];
+      int A = src[AOMMAX(0, y - 1) * sstride + x];
+      int B = src[y * sstride + AOMMAX(0, x - 2)];
+      int C = src[y * sstride + AOMMAX(0, x - 1)];
+      int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+      int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+      int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
       int delta;
       delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
-      dst[y * stride + x] = X + delta;
+      dst[y * dstride + x] = X + delta;
     }
   }
 }
 
+#if CONFIG_AOM_HIGHBITDEPTH
+// Identical to aom_clpf_block_c() apart from the types of "src" and "dst".
+void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
+                          int dstride, int x0, int y0, int sizex, int sizey,
+                          int width, int height, unsigned int strength) {
+  int x, y;
+  for (y = y0; y < y0 + sizey; y++) {
+    for (x = x0; x < x0 + sizex; x++) {
+      int X = src[y * sstride + x];
+      int A = src[AOMMAX(0, y - 1) * sstride + x];
+      int B = src[y * sstride + AOMMAX(0, x - 2)];
+      int C = src[y * sstride + AOMMAX(0, x - 1)];
+      int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+      int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+      int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
+      int delta;
+      delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
+      dst[y * dstride + x] = X + delta;
+    }
+  }
+}
+#endif
+
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                    int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks,
+                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                    const YV12_BUFFER_CONFIG *,
                                    const AV1_COMMON *cm, int, int, int,
                                    unsigned int, unsigned int, uint8_t *)) {
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
-  const int bs = MI_SIZE;
-  int width = cm->mi_cols * bs;
-  int height = cm->mi_rows * bs;
+  const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
+  const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
+  const int bs = (subx || suby) ? 4 : 8;
+  const int bslog = get_msb(bs);
+  int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
+  int height =
+      plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
   int xpos, ypos;
-  int stride_y = rec->y_stride;
-  int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
-  int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
+  const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
+  int dstride = bs;
+  const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
   int block_index = 0;
+  uint8_t *cache = NULL;
+  uint8_t **cache_ptr = NULL;
+  uint8_t **cache_dst = NULL;
+  int cache_idx = 0;
+  const int cache_size = num_fb_hor << (2 * fb_size_log2);
+  const int cache_blocks = cache_size / (bs * bs);
+  uint8_t *src_buffer =
+      plane != AOM_PLANE_Y
+          ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
+          : frame->y_buffer;
+  uint8_t *dst_buffer;
+
+// Make buffer space for in-place filtering
+#if CONFIG_AOM_HIGHBITDEPTH
+  strength <<= (cm->bit_depth - 8);
+  CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
+  dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
+#else
+  CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
+  dst_buffer = cache;
+#endif
+  CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
+  CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
+  memset(cache_ptr, 0, cache_blocks * sizeof(*cache_ptr));
 
   // Iterate over all filter blocks
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
       int allskip = 1;
+      const int xoff = l << fb_size_log2;
+      const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
         for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
-          xpos = (l << fb_size_log2) + n * bs;
-          ypos = (k << fb_size_log2) + m * bs;
+          xpos = xoff + n * bs;
+          ypos = yoff + m * bs;
           if (xpos < width && ypos < height) {
             allskip &=
-                cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
+                cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                                    (xpos << subx) / MI_SIZE]
                     ->mbmi.skip;
           }
         }
@@ -91,36 +148,145 @@
       w += !w << fb_size_log2;
       if (!allskip &&  // Do not filter the block if all is skip encoded
           (!enable_fb_flag ||
-           decision(k, l, rec, org, cm, bs, w / bs, h / bs, strength,
+           decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
                     fb_size_log2, blocks + block_index))) {
         // Iterate over all smaller blocks inside the filter block
-        for (m = 0; m < (h + bs - 1) / bs; m++) {
-          for (n = 0; n < (w + bs - 1) / bs; n++) {
-            xpos = (l << fb_size_log2) + n * bs;
-            ypos = (k << fb_size_log2) + m * bs;
-            if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
-                     ->mbmi.skip) {
-              // Not skip block, apply the filter
-              aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
-                             bs, bs, width, height, strength);
-            } else {  // Skip block, copy instead
-              for (c = 0; c < bs; c++)
-                *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
-                    *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
+        for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
+          for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+            int sizex, sizey;
+            xpos = xoff + n * bs;
+            ypos = yoff + m * bs;
+            sizex = AOMMIN(width - xpos, bs);
+            sizey = AOMMIN(height - ypos, bs);
+            if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                                     (xpos << subx) / MI_SIZE]
+                     ->mbmi.skip) {  // Not skip block
+              // Temporary buffering needed if filtering in-place
+              if (cache_ptr[cache_idx]) {
+// Copy filtered block back into the frame
+#if CONFIG_AOM_HIGHBITDEPTH
+                if (cm->use_highbitdepth) {
+                  uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
+                  if (sizex == 8) {
+                    for (c = 0; c < sizey; c++) {
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                      *(uint64_t *)(d + c * sstride + 4) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                    }
+                  } else if (sizex == 4) {
+                    for (c = 0; c < sizey; c++)
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  } else {
+                    for (c = 0; c < sizey; c++)
+                      memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                             sizex);
+                  }
+                } else {
+                  if (sizex == 8)
+                    for (c = 0; c < sizey; c++)
+                      *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+                  else if (sizex == 4)
+                    for (c = 0; c < sizey; c++)
+                      *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+                          *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                  else
+                    for (c = 0; c < sizey; c++)
+                      memcpy(cache_dst[cache_idx] + c * sstride,
+                             cache_ptr[cache_idx] + c * bs, sizex);
+                }
+#else
+                if (sizex == 8)
+                  for (c = 0; c < sizey; c++)
+                    *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+                else if (sizex == 4)
+                  for (c = 0; c < sizey; c++)
+                    *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+                        *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                else
+                  for (c = 0; c < sizey; c++)
+                    memcpy(cache_dst[cache_idx] + c * sstride,
+                           cache_ptr[cache_idx] + c * bs, sizex);
+#endif
+              }
+#if CONFIG_AOM_HIGHBITDEPTH
+              if (cm->use_highbitdepth) {
+                cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
+                dst_buffer =
+                    CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
+              } else {
+                cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+                dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+              }
+#else
+              cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+              dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+#endif
+              cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
+              if (++cache_idx >= cache_blocks) cache_idx = 0;
+
+// Apply the filter
+#if CONFIG_AOM_HIGHBITDEPTH
+              if (cm->use_highbitdepth) {
+                aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
+                                   CONVERT_TO_SHORTPTR(dst_buffer), sstride,
+                                   dstride, xpos, ypos, sizex, sizey, width,
+                                   height, strength);
+              } else {
+                aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
+                               ypos, sizex, sizey, width, height, strength);
+              }
+#else
+              aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
+                             ypos, sizex, sizey, width, height, strength);
+#endif
             }
           }
         }
-      } else {  // Entire filter block is skip, copy
-        for (m = 0; m < h; m++)
-          memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 w);
       }
       block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
+  // Copy remaining blocks into the frame
+  for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
+       cache_idx++) {
+#if CONFIG_AOM_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
+      for (c = 0; c < bs; c++) {
+        *(uint64_t *)(d + c * sstride) =
+            *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+        if (bs == 8)
+          *(uint64_t *)(d + c * sstride + 4) =
+              *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+      }
+    } else {
+      for (c = 0; c < bs; c++)
+        if (bs == 4)
+          *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+              *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+        else
+          *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+              *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+    }
+#else
+    for (c = 0; c < bs; c++)
+      if (bs == 4)
+        *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
+            *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+      else
+        *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+            *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+#endif
+  }
+
+  aom_free(cache);
+  aom_free(cache_ptr);
+  aom_free(cache_dst);
+
   return block_index;
 }
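For reference, the expression returned by av1_clpf_sample() above, (8 + delta - (delta < 0)) >> 4, is delta/16 rounded to nearest with ties away from zero (relying, as the rest of this code does, on arithmetic right shift of negative values). Worked values:

    /* delta =   8: (8 +  8 - 0) >> 4 =  16 >> 4 =  1   ( 0.5 ->  1)
       delta =  -8: (8 -  8 - 1) >> 4 =  -1 >> 4 = -1   (-0.5 -> -1)
       delta =  24: (8 + 24 - 0) >> 4 =  32 >> 4 =  2   ( 1.5 ->  2)
       delta = -24: (8 - 24 - 1) >> 4 = -17 >> 4 = -2   (-1.5 -> -2) */
    static int round_div16(int delta) { return (8 + delta - (delta < 0)) >> 4; }
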
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 21671a1..8e4213b 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -17,10 +17,10 @@
 
 int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                    int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks,
+                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                    const YV12_BUFFER_CONFIG *,
                                    const AV1_COMMON *cm, int, int, int,
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index faaf8ea..6fef4b7 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -10,187 +10,350 @@
  */
 
 #include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
 
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
-  {
-    int bottom = height - 2 - y0;
-    const v128 sp = v128_dup_8(strength);
-    const v128 sm = v128_dup_8(-(int)strength);
-    const v128 c8 = v128_dup_8(8);
-    const v128 c128 = v128_dup_8(128);
+// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
+//         3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
+//         1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
+SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, v128 sp, v128 sm) {
+  // The difference will be 9 bit, offset by 128 so we can use saturated
+  // sub to avoid going to 16 bit temporarily before "strength" clipping.
+  const v128 c128 = v128_dup_8(128);
+  const v128 x = v128_add_8(c128, o);
+  const v128 c8 = v128_dup_8(8);
+  const v128 tmp = v128_add_8(
+      v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
+      v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
+  const v128 delta = v128_add_8(
+      v128_add_8(
+          v128_shl_8(
+              v128_add_8(
+                  v128_max_s8(
+                      v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
+                      sm),
+                  v128_max_s8(
+                      v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
+                      sm)),
+              2),
+          v128_add_8(
+              v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
+                          sm),
+              v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
+                          sm))),
+      v128_add_8(v128_add_8(tmp, tmp), tmp));
+  return v128_add_8(
+      o,
+      v128_shr_s8(
+          v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+          4));
+}
 
-    if (!x0) {  // Clip left
-      const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                         v64_from_64(0x0504030201000000LL));
-      const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                         v64_from_64(0x0605040302010000LL));
-      int y;
+// Process blocks of width 8, two lines at a time, 8 bit.
+static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
+                        int dstride, int x0, int y0, int sizey, int width,
+                        int height, unsigned int strength) {
+  const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
+  const v128 sp = v128_dup_8(strength);
+  const v128 sm = v128_dup_8(-(int)strength);
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
+  int y;
 
-      for (y = 0; y < sizey; y += 2) {
-        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
-        v128 o = v128_from_v64(l1, l2);
-        const v128 x = v128_add_8(c128, o);
-        const v128 a = v128_add_8(
-            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
-        const v128 b = v128_shuffle_8(x, b_shuff);
-        const v128 c = v128_shuffle_8(x, c_shuff);
-        const v128 d = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
-        const v128 e = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
-        const v128 f = v128_add_8(
-            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
 
-        const v128 tmp =
-            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                       v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-        const v128 delta = v128_add_8(
-            v128_add_8(
-                v128_shl_8(
-                    v128_add_8(
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                    2),
-                v128_add_8(
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-            v128_add_8(v128_add_8(tmp, tmp), tmp));
-        o = v128_add_8(
-            o, v128_shr_s8(
-                   v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
-                                                        delta, v128_zero()))),
-                   4));
-        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
-      }
-    } else if (!(width - x0 - 8)) {  // Clip right
-      const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                         v64_from_64(0x0707060504030201LL));
-      const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                         v64_from_64(0x0707070605040302LL));
-      int y;
+  for (y = 0; y < sizey; y += 2) {
+    const v64 l1 = v64_load_aligned(src);
+    const v64 l2 = v64_load_aligned(src + sstride);
+    v128 o = v128_from_v64(l1, l2);
+    const v128 a =
+        v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
+    const v128 f = v128_from_v64(
+        l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-      for (y = 0; y < sizey; y += 2) {
-        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
-        v128 o = v128_from_v64(l1, l2);
-        const v128 x = v128_add_8(c128, o);
-        const v128 a = v128_add_8(
-            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
-        const v128 b = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
-        const v128 c = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
-        const v128 d = v128_shuffle_8(x, d_shuff);
-        const v128 e = v128_shuffle_8(x, e_shuff);
-        const v128 f = v128_add_8(
-            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
-
-        const v128 tmp =
-            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                       v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-        const v128 delta = v128_add_8(
-            v128_add_8(
-                v128_shl_8(
-                    v128_add_8(
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                    2),
-                v128_add_8(
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-            v128_add_8(v128_add_8(tmp, tmp), tmp));
-        o = v128_add_8(
-            o, v128_shr_s8(
-                   v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
-                                                        delta, v128_zero()))),
-                   4));
-        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
-      }
-    } else {  // No left/right clipping
-      int y;
-      for (y = 0; y < sizey; y += 2) {
-        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
-        v128 o = v128_from_v64(l1, l2);
-        const v128 x = v128_add_8(c128, o);
-        const v128 a = v128_add_8(
-            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
-        const v128 b = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
-        const v128 c = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
-        const v128 d = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
-        const v128 e = v128_add_8(
-            c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
-        const v128 f = v128_add_8(
-            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
-
-        const v128 tmp =
-            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                       v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-        const v128 delta = v128_add_8(
-            v128_add_8(
-                v128_shl_8(
-                    v128_add_8(
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                        v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                    2),
-                v128_add_8(
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                    v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-            v128_add_8(v128_add_8(tmp, tmp), tmp));
-        o = v128_add_8(
-            o, v128_shr_s8(
-                   v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
-                                                        delta, v128_zero()))),
-                   4));
-        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
-      }
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+    }
+
+    o = calc_delta(o, a, b, c, d, e, f, sp, sm);
+    v64_store_aligned(dst, v128_high_v64(o));
+    v64_store_aligned(dst + dstride, v128_low_v64(o));
+    src += sstride * 2;
+    dst += dstride * 2;
   }
 }
 
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
-  // TODO(stemidts):
-  // A sizex different from 8 will only be needed if CLPF is extended to chroma.
-  // This will only be used if 4:2:0 and width not a multiple of 16 and along
-  // the right edge only, so we can fall back to the plain C implementation in
-  // this case.  If not extended to chroma, this test will be redundant.
-  if (sizex != 8 || width < 16) {  // Fallback to C if frame width < 16
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
-  } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+// Process blocks of width 4, four lines at a time, 8 bit.
+static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
+                        int dstride, int x0, int y0, int sizey, int width,
+                        int height, unsigned int strength) {
+  const v128 sp = v128_dup_8(strength);
+  const v128 sm = v128_dup_8(-(int)strength);
+  const int right = width - 4 - x0;
+  const int bottom = height - 4 - y0;
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
+  int y;
+
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
+
+  for (y = 0; y < sizey; y += 4) {
+    const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
+    const uint32_t l1 = u32_load_aligned(src);
+    const uint32_t l2 = u32_load_aligned(src + sstride);
+    const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
+    const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
+    const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
+    v128 o = v128_from_32(l1, l2, l3, l4);
+    const v128 a = v128_from_32(l0, l1, l2, l3);
+    const v128 f = v128_from_32(l2, l3, l4, l5);
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+    }
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+    }
+
+    o = calc_delta(o, a, b, c, d, e, f, sp, sm);
+    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
+    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
+    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
+    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
+
+    dst += 4 * dstride;
+    src += 4 * sstride;
   }
 }
+
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block widths not 4 or 8
+    // * block heights not a multiple of 4 if the block width is 4
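+    // (e.g. a 4x6 block falls back to C, while a 4x8 block does not)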
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
+  } else {
+    (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
+                                             sizey, width, height, strength);
+  }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
+//         3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
+//         1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
+SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                                v128 f, v128 sp, v128 sm) {
+  const v128 c8 = v128_dup_16(8);
+  const v128 tmp =
+      v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
+                  v128_max_s16(v128_min_s16(v128_sub_16(d, o), sp), sm));
+  const v128 delta = v128_add_16(
+      v128_add_16(
+          v128_shl_16(
+              v128_add_16(
+                  v128_max_s16(v128_min_s16(v128_sub_16(a, o), sp), sm),
+                  v128_max_s16(v128_min_s16(v128_sub_16(f, o), sp), sm)),
+              2),
+          v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
+                      v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
+      v128_add_16(v128_add_16(tmp, tmp), tmp));
+  return v128_add_16(
+      o, v128_shr_s16(
+             v128_add_16(
+                 c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+             4));
+}
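+//
+// A scalar reference for the vector code above (illustrative sketch only;
+// clamp() is the helper from aom_dsp/aom_dsp_common.h, and av1_clpf_sample()
+// in av1/common/clpf.c computes the same rounded delta for the C fallbacks):
+//
+//   int clpf_sample(int X, int A, int B, int C, int D, int E, int F, int s) {
+//     int delta = 4 * clamp(A - X, -s, s) + clamp(B - X, -s, s) +
+//                 3 * clamp(C - X, -s, s) + 3 * clamp(D - X, -s, s) +
+//                 clamp(E - X, -s, s) + 4 * clamp(F - X, -s, s);
+//     return X + ((8 + delta - (delta < 0)) >> 4);
+//   }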
+
+static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, uint16_t *dst, v128 sp, v128 sm,
+                            int dstride) {
+  o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
+  v64_store_aligned(dst, v128_high_v64(o));
+  v64_store_aligned(dst + dstride, v128_low_v64(o));
+}
+
+static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, uint16_t *dst, v128 sp, v128 sm) {
+  v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
+}
+
+// Process blocks of width 4, two lines at a time, 16 bit.
+SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
+                                 int sstride, int dstride, int x0, int y0,
+                                 int sizey, int width, int height,
+                                 unsigned int strength) {
+  const v128 sp = v128_dup_16(strength);
+  const v128 sm = v128_dup_16(-(int)strength);
+  const int right = width - 4 - x0;
+  const int bottom = height - 2 - y0;
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
+  int y;
+
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
+
+  for (y = 0; y < sizey; y += 2) {
+    const v64 l1 = v64_load_aligned(src);
+    const v64 l2 = v64_load_aligned(src + sstride);
+    v128 o = v128_from_v64(l1, l2);
+    const v128 a =
+        v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
+    const v128 f = v128_from_v64(
+        l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+    }
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+    }
+    calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
+    src += sstride * 2;
+    dst += dstride * 2;
+  }
+}
+
+// The simplest case.  Start here if you need to understand the functions.
+SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
+                                int dstride, int x0, int y0, int sizey,
+                                int width, int height, unsigned int strength) {
+  const v128 sp = v128_dup_16(strength);
+  const v128 sm = v128_dup_16(-(int)strength);
+  const int right = width - 8 - x0;
+  const int bottom = height - 2 - y0;
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
+  int y;
+
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
+
+  // Read 8 sets of pixels at a time.  Clipping along the upper and lower
+  // edges is handled by reading the upper or lower line twice.
+  // Clipping along the left and right edges is handled by shuffle
+  // instructions doing shift and pad.
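+  // E.g. "a" is normally the line above, but on the top frame border
+  // y == -y0, so the offset (y != -y0) * sstride is zero and the current
+  // line is read again instead.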
+  for (y = 0; y < sizey; y++) {
+    const v128 o = v128_load_aligned(src);
+    const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
+    const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
+    }
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
+    }
+    calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
+    src += sstride;
+    dst += dstride;
+  }
+}
+
+void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
+                                   int sstride, int dstride, int x0, int y0,
+                                   int sizex, int sizey, int width, int height,
+                                   unsigned int strength) {
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+  // * block widths not 4 or 8
+    // * block heights not a multiple of 2 if the block width is 4
+    aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
+                         width, height, strength);
+  } else {
+    (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
+        src, dst, sstride, dstride, x0, y0, sizey, width, height, strength);
+  }
+}
+#endif
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 2bb680a..98f4f51 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -153,7 +153,9 @@
 #if CONFIG_CLPF
   int clpf_numblocks;
   int clpf_size;
-  int clpf_strength;
+  int clpf_strength_y;
+  int clpf_strength_u;
+  int clpf_strength_v;
   uint8_t *clpf_blocks;
 #endif
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index dc18944..7daeb5d 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -29,6 +29,7 @@
 
 #include "av1/common/alloccommon.h"
 #if CONFIG_CLPF
+#include "aom/aom_image.h"
 #include "av1/common/clpf.h"
 #endif
 #include "av1/common/common.h"
@@ -2046,8 +2047,10 @@
 #if CONFIG_CLPF
 static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   cm->clpf_blocks = 0;
-  cm->clpf_strength = aom_rb_read_literal(rb, 2);
-  if (cm->clpf_strength) {
+  cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
+  cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
+  cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
+  if (cm->clpf_strength_y) {
     cm->clpf_size = aom_rb_read_literal(rb, 2);
     if (cm->clpf_size) {
       int i;
@@ -3928,20 +3931,23 @@
 #endif  // CONFIG_LOOP_RESTORATION
 
 #if CONFIG_CLPF
-  if (cm->clpf_strength && !cm->skip_loop_filter) {
-    YV12_BUFFER_CONFIG dst;  // Buffer for the result
-
-    dst = pbi->cur_buf->buf;
-    CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
-
-    av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
-                   cm->clpf_strength + (cm->clpf_strength == 3),
-                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
-
-    // Copy result
-    memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
-           dst.y_height * dst.y_stride);
-    aom_free(dst.y_buffer);
+  if (!cm->skip_loop_filter) {
+    const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
+    if (cm->clpf_strength_y) {
+      av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
+                     cm->clpf_strength_y + (cm->clpf_strength_y == 3),
+                     4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
+    }
+    if (cm->clpf_strength_u) {
+      av1_clpf_frame(frame, NULL, cm, 0,
+                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
+                     AOM_PLANE_U, NULL);
+    }
+    if (cm->clpf_strength_v) {
+      av1_clpf_frame(frame, NULL, cm, 0,
+                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
+                     AOM_PLANE_V, NULL);
+    }
   }
   if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
 #endif
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index dab1008..1438a56 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -1454,6 +1454,7 @@
         };
 
         mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+        mi->bmi[j].as_mode = b_mode;
         if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int;
 
         if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 2b846e8..6774bb2 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -662,7 +662,7 @@
 
         if (t != ONE_TOKEN) {
           int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-          av1_write_tree(w, av1_coef_con_tree,
+          aom_write_tree(w, av1_coef_con_tree,
                          av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], v,
                          n - len, 0);
         }
@@ -836,7 +836,7 @@
                              const struct segmentation_probs *segp,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
-    av1_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
+    aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
 }
 
 // This function encodes the reference frame
@@ -2590,8 +2590,10 @@
 
 #if CONFIG_CLPF
 static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, cm->clpf_strength, 2);
-  if (cm->clpf_strength) {
+  aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
+  aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
+  aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
+  if (cm->clpf_strength_y) {
     aom_wb_write_literal(wb, cm->clpf_size, 2);
     if (cm->clpf_size) {
       int i;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 2156032..57b42a8 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -164,9 +164,6 @@
   // Store the second best motion vector during full-pixel motion search
   int_mv second_best_mv;
 
-  // Strong color activity detection. Used in RTC coding mode to enhance
-  // the visual quality at the boundary of moving color objects.
-  uint8_t color_sensitivity[2];
 
   // use default transform and skip transform type search for intra modes
   int use_default_intra_tx_type;
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 4221505..1d498f1 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -11,16 +11,17 @@
 
 #include "av1/common/clpf.h"
 #include "./aom_dsp_rtcd.h"
+#include "aom/aom_image.h"
 #include "aom/aom_integer.h"
 #include "av1/common/quant_common.h"
 
 // Calculate the error of a filtered and unfiltered block
 void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
                        int ostride, int x0, int y0, int width, int height,
-                       int *sum0, int *sum1, unsigned int strength) {
+                       int *sum0, int *sum1, unsigned int strength, int size) {
   int x, y;
-  for (y = y0; y < y0 + 8; y++) {
-    for (x = x0; x < x0 + 8; x++) {
+  for (y = y0; y < y0 + size; y++) {
+    for (x = x0; x < x0 + size; x++) {
       int O = org[y * ostride + x];
       int X = rec[y * rstride + x];
       int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -39,11 +40,11 @@
 
 void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
                              int rstride, int ostride, int x0, int y0,
-                             int width, int height, int *sum) {
+                             int width, int height, int *sum, int size) {
   int x, y;
 
-  for (y = y0; y < y0 + 8; y++) {
-    for (x = x0; x < x0 + 8; x++) {
+  for (y = y0; y < y0 + size; y++) {
+    for (x = x0; x < x0 + size; x++) {
       int O = org[y * ostride + x];
       int X = rec[y * rstride + x];
       int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -66,21 +67,94 @@
   }
 }
 
+#if CONFIG_AOM_HIGHBITDEPTH
+// Identical to aom_clpf_detect_c() apart from "rec" and "org" being 16 bit
+// and the pixels being reduced to 8 significant bits through "shift".
+void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
+                           int rstride, int ostride, int x0, int y0, int width,
+                           int height, int *sum0, int *sum1,
+                           unsigned int strength, int shift, int size) {
+  int x, y;
+  for (y = y0; y < y0 + size; y++) {
+    for (x = x0; x < x0 + size; x++) {
+      int O = org[y * ostride + x] >> shift;
+      int X = rec[y * rstride + x] >> shift;
+      int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
+      int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
+      int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
+      int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
+      int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
+      int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
+      int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength >> shift);
+      int Y = X + delta;
+      *sum0 += (O - X) * (O - X);
+      *sum1 += (O - Y) * (O - Y);
+    }
+  }
+}
+
+// Identical to aom_clpf_detect_multi_c() apart from "rec" and "org" being
+// 16 bit and the pixels being reduced to 8 significant bits through "shift".
+void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
+                                 int rstride, int ostride, int x0, int y0,
+                                 int width, int height, int *sum, int shift,
+                                 int size) {
+  int x, y;
+
+  for (y = y0; y < y0 + size; y++) {
+    for (x = x0; x < x0 + size; x++) {
+      int O = org[y * ostride + x] >> shift;
+      int X = rec[y * rstride + x] >> shift;
+      int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
+      int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
+      int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
+      int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
+      int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
+      int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
+      int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1);
+      int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2);
+      int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4);
+      int F1 = X + delta1;
+      int F2 = X + delta2;
+      int F3 = X + delta3;
+      sum[0] += (O - X) * (O - X);
+      sum[1] += (O - F1) * (O - F1);
+      sum[2] += (O - F2) * (O - F2);
+      sum[3] += (O - F3) * (O - F3);
+    }
+  }
+}
+#endif
+
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
                       unsigned int fb_size_log2, uint8_t *res) {
   int m, n, sum0 = 0, sum1 = 0;
+
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
       int xpos = (l << fb_size_log2) + n * block_size;
       int ypos = (k << fb_size_log2) + m * block_size;
-      const int bs = MAX_MIB_SIZE;
-      if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
-               ->mbmi.skip)
+      if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+               ->mbmi.skip) {
+#if CONFIG_AOM_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
+                              CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
+                              org->y_stride, xpos, ypos, rec->y_crop_width,
+                              rec->y_crop_height, &sum0, &sum1, strength,
+                              cm->bit_depth - 8, block_size);
+        } else {
+          aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
+                          org->y_stride, xpos, ypos, rec->y_crop_width,
+                          rec->y_crop_height, &sum0, &sum1, strength,
+                          block_size);
+        }
+#else
         aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
                         org->y_stride, xpos, ypos, rec->y_crop_width,
-                        rec->y_crop_height, &sum0, &sum1, strength);
+                        rec->y_crop_height, &sum0, &sum1, strength, block_size);
+#endif
+      }
     }
   }
   *res = sum1 < sum0;
@@ -90,6 +164,7 @@
 // Calculate the square error of all filter settings.  Result:
 // res[0][0]   : unfiltered
 // res[0][1-3] : strength=1,2,4, no signals
+// (Only for luma:)
 // res[1][0]   : (bit count, fb size = 128)
 // res[1][1-3] : strength=1,2,4, fb size = 128
 // res[2][0]   : (bit count, fb size = 64)
@@ -99,12 +174,28 @@
 static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                     unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][4]) {
+                    int h, int64_t res[4][4], int plane) {
   int c, m, n, filtered = 0;
   int sum[4];
+  const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
+  const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
   int bslog = get_msb(block_size);
+  uint8_t *rec_buffer =
+      plane != AOM_PLANE_Y
+          ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
+          : rec->y_buffer;
+  uint8_t *org_buffer =
+      plane != AOM_PLANE_Y
+          ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
+          : org->y_buffer;
+  int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+  int rec_height =
+      plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+  int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
+  int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
   sum[0] = sum[1] = sum[2] = sum[3] = 0;
-  if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
+  if (plane == AOM_PLANE_Y &&
+      fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
     int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
 
     fb_size_log2--;
@@ -119,16 +210,17 @@
     oldfiltered = res[i][0];
     res[i][0] = 0;
 
-    filtered =
-        clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res);
+    filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+                        res, plane);
     if (1 << (fb_size_log2 - bslog) < w)
       filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
-                           fb_size_log2, w2, h1, res);
+                           fb_size_log2, w2, h1, res, plane);
     if (1 << (fb_size_log2 - bslog) < h) {
       filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
-                           fb_size_log2, w1, h2, res);
-      filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2),
-                           rec, org, cm, block_size, fb_size_log2, w2, h2, res);
+                           fb_size_log2, w1, h2, res, plane);
+      filtered |=
+          clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
+                   cm, block_size, fb_size_log2, w2, h2, res, plane);
     }
 
     res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
@@ -142,18 +234,31 @@
     for (n = 0; n < w; n++) {
       int xpos = x + n * block_size;
       int ypos = y + m * block_size;
-      if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride +
-                               xpos / MAX_MIB_SIZE]
+      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                               (xpos << subx) / MI_SIZE]
                ->mbmi.skip) {
-        aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
-                              org->y_stride, xpos, ypos, rec->y_crop_width,
-                              rec->y_crop_height, sum);
+#if CONFIG_AOM_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          aom_clpf_detect_multi_hbd(
+              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
+              cm->bit_depth - 8, block_size);
+        } else {
+          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                                xpos, ypos, rec_width, rec_height, sum,
+                                block_size);
+        }
+#else
+        aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                              xpos, ypos, rec_width, rec_height, sum,
+                              block_size);
+#endif
         filtered = 1;
       }
     }
   }
 
-  for (c = 0; c < 4; c++) {
+  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
     res[c][0] += sum[0];
     res[c][1] += sum[1];
     res[c][2] += sum[2];
@@ -164,59 +269,69 @@
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs) {
+                         int *best_strength, int *best_bs, int plane) {
   int c, j, k, l;
   int64_t best, sums[4][4];
-  int width = rec->y_crop_width, height = rec->y_crop_height;
-  const int bs = MAX_MIB_SIZE;
+  int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+  int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+  const int bs = MI_SIZE;
+  const int bslog = get_msb(bs);
   int fb_size_log2 = get_msb(MAX_FB_SIZE);
   int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
   int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
 
   memset(sums, 0, sizeof(sums));
 
-  for (k = 0; k < num_fb_ver; k++) {
-    for (l = 0; l < num_fb_hor; l++) {
-      // Calculate the block size after frame border clipping
-      int h =
-          AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
-      int w =
-          AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
-      h += !h << fb_size_log2;
-      w += !w << fb_size_log2;
-      clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs,
-               fb_size_log2, w / bs, h / bs, sums);
+  if (plane != AOM_PLANE_Y)
+    // Use a block size of MI_SIZE regardless of the subsampling.  This
+    // is accurate enough to determine the best strength and we don't
+    // need to add SIMD optimisations for 4x4 blocks.
+    clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
+             height >> bslog, sums, plane);
+  else
+    for (k = 0; k < num_fb_ver; k++) {
+      for (l = 0; l < num_fb_hor; l++) {
+        // Calculate the block size after frame border clipping
+        int h =
+            AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+        int w =
+            AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+        h += !h << fb_size_log2;
+        w += !w << fb_size_log2;
+        clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
+                 fb_size_log2, w >> bslog, h >> bslog, sums, plane);
+      }
     }
-  }
+
+  if (plane != AOM_PLANE_Y)  // Slightly favour unfiltered chroma
+    sums[0][0] -= sums[0][0] >> 7;
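+  // (this lowers the unfiltered SSD by 1/128, i.e. roughly 0.8%)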
 
   for (j = 0; j < 4; j++) {
     static const double lambda_square[] = {
-      // exp((i - 15.4244) / 8.4010)
-      0.159451, 0.179607, 0.202310, 0.227884, 0.256690, 0.289138, 0.325687,
-      0.366856, 0.413230, 0.465465, 0.524303, 0.590579, 0.665233, 0.749323,
-      0.844044, 0.950737, 1.070917, 1.206289, 1.358774, 1.530533, 1.724004,
-      1.941931, 2.187406, 2.463911, 2.775368, 3.126195, 3.521370, 3.966498,
-      4.467893, 5.032669, 5.668837, 6.385421, 7.192586, 8.101784, 9.125911,
-      10.27949, 11.57890, 13.04256, 14.69124, 16.54832, 18.64016, 20.99641,
-      23.65052, 26.64013, 30.00764, 33.80084, 38.07352, 42.88630, 48.30746,
-      54.41389, 61.29221, 69.04002, 77.76720, 87.59756, 98.67056, 111.1432,
-      125.1926, 141.0179, 158.8436, 178.9227, 201.5399, 227.0160, 255.7126,
-      288.0366
+      // exp(i / 8.5) for table index i
+      1.0000, 1.1248, 1.2653, 1.4232, 1.6009, 1.8008, 2.0256, 2.2785,
+      2.5630, 2.8830, 3.2429, 3.6478, 4.1032, 4.6155, 5.1917, 5.8399,
+      6.5689, 7.3891, 8.3116, 9.3492, 10.516, 11.829, 13.306, 14.967,
+      16.836, 18.938, 21.302, 23.962, 26.953, 30.318, 34.103, 38.361,
+      43.151, 48.538, 54.598, 61.414, 69.082, 77.706, 87.408, 98.320,
+      110.59, 124.40, 139.93, 157.40, 177.05, 199.16, 224.02, 251.99,
+      283.45, 318.84, 358.65, 403.42, 453.79, 510.45, 574.17, 645.86,
+      726.49, 817.19, 919.22, 1033.9, 1163.0, 1308.2, 1471.6, 1655.3
     };
 
     // Estimate the bit costs and adjust the square errors
     double lambda =
         lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
-    int i, cost = (int)((1.2 * lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5));
+    int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
     for (i = 0; i < 4; i++)
       sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
   }
 
   best = (int64_t)1 << 62;
-  for (c = 0; c < 4; c++)
+  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
     for (j = 0; j < 4; j++)
       if ((!c || j) && sums[c][j] < best) best = sums[c][j];
   best &= 15;
-  *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
+  if (best_bs) *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
   *best_strength = best ? 1 << ((best - 1) & 3) : 0;
 }
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index 3dd5478..bb85fbc 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -21,6 +21,6 @@
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs);
+                         int *best_strength, int *best_bs, int plane);
 
 #endif
diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h
index abbbe7c..7c07329 100644
--- a/av1/encoder/clpf_rdo_simd.h
+++ b/av1/encoder/clpf_rdo_simd.h
@@ -9,496 +9,278 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "./aom_dsp_rtcd.h"
 #include "aom_dsp/aom_simd.h"
+#include "aom_ports/mem.h"
+
+SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+                           v128 *f) {
+  // The difference would be 9 bit, so offset both operands by 128 and use
+  // saturating subtraction to avoid going to 16 bit temporarily before the
+  // "strength" clipping.
+  const v128 c128 = v128_dup_8(128);
+  v128 x = v128_add_8(c128, o);
+  *a = v128_ssub_s8(v128_add_8(c128, *a), x);
+  *b = v128_ssub_s8(v128_add_8(c128, *b), x);
+  *c = v128_ssub_s8(v128_add_8(c128, *c), x);
+  *d = v128_ssub_s8(v128_add_8(c128, *d), x);
+  *e = v128_ssub_s8(v128_add_8(c128, *e), x);
+  *f = v128_ssub_s8(v128_add_8(c128, *f), x);
+}
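+// E.g. for o = 0 and *a = 255 the real difference is 255 but the saturated
+// result is 127; this is harmless, since the subsequent clipping to
+// +/- strength (1, 2 or 4) discards the excess anyway.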
+
+SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                              v128 f, v128 sp, v128 sm) {
+  const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
+                              v128_max_s8(v128_min_s8(d, sp), sm));
+  const v128 delta = v128_add_8(
+      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
+                                       v128_max_s8(v128_min_s8(f, sp), sm)),
+                            2),
+                 v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
+                            v128_max_s8(v128_min_s8(e, sp), sm))),
+      v128_add_8(v128_add_8(tmp, tmp), tmp));
+
+  return v128_add_8(
+      o, v128_shr_s8(
+             v128_add_8(v128_dup_8(8),
+                        v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+             4));
+}
+
+SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, v128 sp, v128 sm) {
+  calc_diff(o, &a, &b, &c, &d, &e, &f);
+  return delta_kernel(o, a, b, c, d, e, f, sp, sm);
+}
+
+SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
+                            int right) {
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
+
+  if (!left) {  // Left clipping
+    *b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
+    *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
+  }
+  if (!right) {  // Right clipping
+    *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
+    *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
+  }
+}
+
+SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
+                                int rstride, int ostride, int x0, int y0,
+                                int bottom, int right, int y, v128 *o, v128 *r,
+                                v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+                                v128 *f) {
+  const v64 k1 = v64_load_aligned(org);
+  const v64 k2 = v64_load_aligned(org + ostride);
+  const v64 l1 = v64_load_aligned(rec);
+  const v64 l2 = v64_load_aligned(rec + rstride);
+  *o = v128_from_v64(k1, k2);
+  *r = v128_from_v64(l1, l2);
+  *a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
+  *f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
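+  // The -2/-1/+1/+2 offsets are multiplied by !!x0 or !!right so the loads
+  // never stray outside the frame on a border; clip_sides() then repairs
+  // the affected lanes with the edge replicating shuffles.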
+  *b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
+                     v64_load_unaligned(rec - 2 * !!x0 + rstride));
+  *c = v128_from_v64(v64_load_unaligned(rec - !!x0),
+                     v64_load_unaligned(rec - !!x0 + rstride));
+  *d = v128_from_v64(v64_load_unaligned(rec + !!right),
+                     v64_load_unaligned(rec + !!right + rstride));
+  *e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
+                     v64_load_unaligned(rec + 2 * !!right + rstride));
+  clip_sides(b, c, d, e, x0, right);
+}
 
 void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
                                 int rstride, int ostride, int x0, int y0,
                                 int width, int height, int *sum0, int *sum1,
-                                unsigned int strength) {
-  ssd128_internal ssd0 = v128_ssd_u8_init();
-  ssd128_internal ssd1 = v128_ssd_u8_init();
-  const v128 c128 = v128_dup_8(128);
+                                unsigned int strength, int size) {
   const v128 sp = v128_dup_8(strength);
   const v128 sm = v128_dup_8(-(int)strength);
+  const int right = width - 8 - x0;
   const int bottom = height - 2 - y0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
+                      sum1, strength, size);
+    return;
+  }
 
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;
 
-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 1),
-                              v64_load_unaligned(rec + 1 + rstride)));
-      const v128 e = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 2),
-                              v64_load_unaligned(rec + 2 + rstride)));
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
-      const v128 tmp =
-          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                     v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-      v128 delta = v128_add_8(
-          v128_add_8(
-              v128_shl_8(
-                  v128_add_8(
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                  2),
-              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                         v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-      delta = v128_shr_s8(
-          v128_add_8(v128_dup_8(8),
-                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-          4);
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 2),
-                              v64_load_unaligned(rec - 2 + rstride)));
-      const v128 c = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
-      const v128 tmp =
-          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                     v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-      v128 delta = v128_add_8(
-          v128_add_8(
-              v128_shl_8(
-                  v128_add_8(
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                  2),
-              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                         v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      delta = v128_shr_s8(
-          v128_add_8(v128_dup_8(8),
-                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-          4);
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 2),
-                              v64_load_unaligned(rec - 2 + rstride)));
-      const v128 c = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 1),
-                              v64_load_unaligned(rec + 1 + rstride)));
-      const v128 e = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 2),
-                              v64_load_unaligned(rec + 2 + rstride)));
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-
-      const v128 tmp =
-          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                     v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-      v128 delta = v128_add_8(
-          v128_add_8(
-              v128_shl_8(
-                  v128_add_8(
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                      v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-                  2),
-              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                         v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      delta = v128_shr_s8(
-          v128_add_8(v128_dup_8(8),
-                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-          4);
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
   }
   *sum0 += v128_ssd_u8_sum(ssd0);
   *sum1 += v128_ssd_u8_sum(ssd1);
 }
 
-// Test multiple filter strengths at once.  Use a simpler filter (4 tap, every
-// second line).
+SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
+                                  v128 d, v128 e, v128 f, ssd128_internal *ssd1,
+                                  ssd128_internal *ssd2,
+                                  ssd128_internal *ssd3) {
+  calc_diff(r, &a, &b, &c, &d, &e, &f);
+  *ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
+                                             v128_dup_8(-1)));
+  *ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
+                                             v128_dup_8(-2)));
+  *ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
+                                             v128_dup_8(-4)));
+}
+
+// Test multiple filter strengths at once.
 void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                       int rstride, int ostride, int x0, int y0,
-                                      int width, int height, int *sum) {
-  const v128 c128 = v128_dup_8(128);
-  const v128 cp1 = v128_dup_8(1);
-  const v128 cm1 = v128_dup_8(-1);
-  const v128 cp2 = v128_dup_8(2);
-  const v128 cm2 = v128_dup_8(-2);
-  const v128 cp4 = v128_dup_8(4);
-  const v128 cm4 = v128_dup_8(-4);
-  const v128 c8 = v128_dup_8(8);
+                                      int width, int height, int *sum,
+                                      int size) {
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
   ssd128_internal ssd1 = v128_ssd_u8_init();
   ssd128_internal ssd2 = v128_ssd_u8_init();
   ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
+                            sum, size);
+    return;
+  }
 
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;
 
-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_shuffle_8(x, b_shuff);
-      v128 c = v128_shuffle_8(x, c_shuff);
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      v128 tmp, delta1, delta2, delta3;
-
-      a = v128_ssub_s8(a, x);
-      b = v128_ssub_s8(b, x);
-      c = v128_ssub_s8(c, x);
-      d = v128_ssub_s8(d, x);
-      e = v128_ssub_s8(e, x);
-      f = v128_ssub_s8(f, x);
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
-                       v128_max_s8(v128_min_s8(d, cp1), cm1));
-      delta1 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
-                                    v128_max_s8(v128_min_s8(f, cp1), cm1)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
-                         v128_max_s8(v128_min_s8(e, cp1), cm1))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
-                       v128_max_s8(v128_min_s8(d, cp2), cm2));
-      delta2 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
-                                    v128_max_s8(v128_min_s8(f, cp2), cm2)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
-                         v128_max_s8(v128_min_s8(e, cp2), cm2))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
-                       v128_max_s8(v128_min_s8(d, cp4), cm4));
-      delta3 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
-                                    v128_max_s8(v128_min_s8(f, cp4), cm4)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
-                         v128_max_s8(v128_min_s8(e, cp4), cm4))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
-                                                        delta1, v128_zero()))),
-                  4)));
-      ssd2 = v128_ssd_u8(
-          ssd2, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
-                                                        delta2, v128_zero()))),
-                  4)));
-      ssd3 = v128_ssd_u8(
-          ssd3, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
-                                                        delta3, v128_zero()))),
-                  4)));
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_shuffle_8(x, d_shuff);
-      v128 e = v128_shuffle_8(x, e_shuff);
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      v128 tmp, delta1, delta2, delta3;
-
-      a = v128_ssub_s8(a, x);
-      b = v128_ssub_s8(b, x);
-      c = v128_ssub_s8(c, x);
-      d = v128_ssub_s8(d, x);
-      e = v128_ssub_s8(e, x);
-      f = v128_ssub_s8(f, x);
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
-                       v128_max_s8(v128_min_s8(d, cp1), cm1));
-      delta1 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
-                                    v128_max_s8(v128_min_s8(f, cp1), cm1)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
-                         v128_max_s8(v128_min_s8(e, cp1), cm1))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
-                       v128_max_s8(v128_min_s8(d, cp2), cm2));
-      delta2 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
-                                    v128_max_s8(v128_min_s8(f, cp2), cm2)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
-                         v128_max_s8(v128_min_s8(e, cp2), cm2))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
-                       v128_max_s8(v128_min_s8(d, cp4), cm4));
-      delta3 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
-                                    v128_max_s8(v128_min_s8(f, cp4), cm4)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
-                         v128_max_s8(v128_min_s8(e, cp4), cm4))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
-                                                        delta1, v128_zero()))),
-                  4)));
-      ssd2 = v128_ssd_u8(
-          ssd2, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
-                                                        delta2, v128_zero()))),
-                  4)));
-      ssd3 = v128_ssd_u8(
-          ssd3, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
-                                                        delta3, v128_zero()))),
-                  4)));
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      v128 tmp, delta1, delta2, delta3;
-
-      a = v128_ssub_s8(a, x);
-      b = v128_ssub_s8(b, x);
-      c = v128_ssub_s8(c, x);
-      d = v128_ssub_s8(d, x);
-      e = v128_ssub_s8(e, x);
-      f = v128_ssub_s8(f, x);
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
-                       v128_max_s8(v128_min_s8(d, cp1), cm1));
-      delta1 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
-                                    v128_max_s8(v128_min_s8(f, cp1), cm1)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
-                         v128_max_s8(v128_min_s8(e, cp1), cm1))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
-                       v128_max_s8(v128_min_s8(d, cp2), cm2));
-      delta2 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
-                                    v128_max_s8(v128_min_s8(f, cp2), cm2)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
-                         v128_max_s8(v128_min_s8(e, cp2), cm2))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-      tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
-                       v128_max_s8(v128_min_s8(d, cp4), cm4));
-      delta3 = v128_add_8(
-          v128_add_8(
-              v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
-                                    v128_max_s8(v128_min_s8(f, cp4), cm4)),
-                         2),
-              v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
-                         v128_max_s8(v128_min_s8(e, cp4), cm4))),
-          v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
-                                                        delta1, v128_zero()))),
-                  4)));
-      ssd2 = v128_ssd_u8(
-          ssd2, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
-                                                        delta2, v128_zero()))),
-                  4)));
-      ssd3 = v128_ssd_u8(
-          ssd3, o,
-          v128_add_8(
-              q,
-              v128_shr_s8(
-                  v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
-                                                        delta3, v128_zero()))),
-                  4)));
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
   }
   sum[0] += v128_ssd_u8_sum(ssd0);
   sum[1] += v128_ssd_u8_sum(ssd1);
   sum[2] += v128_ssd_u8_sum(ssd2);
   sum[3] += v128_ssd_u8_sum(ssd3);
 }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
+                                    int rstride, int ostride, int x0, int y0,
+                                    int bottom, int right, int y, v128 *o,
+                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
+                                    v128 *e, v128 *f, int shift) {
+  const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
+  const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
+  *o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
+                      v128_shr_u16(v128_load_aligned(org + ostride), shift));
+  *r = v128_unziplo_8(n1, n2);
+  *a = v128_unziplo_8(
+      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
+  *f = v128_unziplo_8(
+      n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
+                       shift));
+  *b = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
+  *c = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
+  *d = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
+  *e = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
+  clip_sides(b, c, d, e, x0, right);
+}
+
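`read_two_lines_hbd` lets the high-bit-depth path reuse the 8-bit kernels: each 16-bit sample is shifted right by `shift` so it fits in a byte, two rows are packed into one 16-byte vector with `v128_unziplo_8`, and the callers scale the strength by the same `shift`. A hypothetical scalar picture of the packing (the exact lane order is whatever `v128_unziplo_8` produces; the ordering here is illustrative only):

```c
#include <stdint.h>

/* Pack two rows of eight 16-bit samples into one 16-byte vector after
 * shifting them down to 8 significant bits (sketch, not the real layout). */
static void pack_two_rows(const uint16_t *row0, const uint16_t *row1,
                          uint8_t out[16], int shift) {
  for (int i = 0; i < 8; i++) {
    out[i] = (uint8_t)(row0[i] >> shift);
    out[8 + i] = (uint8_t)(row1[i] >> shift);
  }
}
```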
+void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
+                                    int rstride, int ostride, int x0, int y0,
+                                    int width, int height, int *sum0, int *sum1,
+                                    unsigned int strength, int shift,
+                                    int size) {
+  const v128 sp = v128_dup_8(strength >> shift);
+  const v128 sm = v128_dup_8(-(int)(strength >> shift));
+  const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fall back to plain C
+    aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
+                          sum0, sum1, strength, shift, size);
+    return;
+  }
+
+  rec += x0 + y0 * rstride;
+  org += x0 + y0 * ostride;
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
+  }
+  *sum0 += v128_ssd_u8_sum(ssd0);
+  *sum1 += v128_ssd_u8_sum(ssd1);
+}
+
+void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
+                                          const uint16_t *org, int rstride,
+                                          int ostride, int x0, int y0,
+                                          int width, int height, int *sum,
+                                          int shift, int size) {
+  const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  ssd128_internal ssd2 = v128_ssd_u8_init();
+  ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fall back to plain C
+    aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
+                                height, sum, shift, size);
+    return;
+  }
+
+  rec += x0 + y0 * rstride;
+  org += x0 + y0 * ostride;
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
+  }
+  sum[0] += v128_ssd_u8_sum(ssd0);
+  sum[1] += v128_ssd_u8_sum(ssd1);
+  sum[2] += v128_ssd_u8_sum(ssd2);
+  sum[3] += v128_ssd_u8_sum(ssd3);
+}
+#endif
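This file is header-style SIMD code: it is compiled once per target, with `SIMD_FUNC` expanding to an architecture-suffixed name. A plausible per-target wrapper, following the usual libaom pattern (the file name and macro body here are an assumption, not part of this patch):

```c
/* e.g. clpf_sse4_1.c -- assumed wrapper shape, not shown in this change */
#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_sse4_1
#include "./clpf_simd.h"
```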
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 87e7d51..2eecee4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -815,7 +815,9 @@
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+#if CONFIG_DUAL_FILTER
   int i;
+#endif
   const uint8_t *src;
   const uint8_t *ref;
   int src_stride;
@@ -859,7 +861,6 @@
 
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-    unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
     const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
@@ -916,20 +917,6 @@
 
     av1_build_inter_predictors_sb(xd, mi_row, mi_col, cm->sb_size);
 
-    for (i = 1; i < MAX_MB_PLANE; ++i) {
-      struct macroblock_plane *p = &x->plane[i];
-      struct macroblockd_plane *pd = &xd->plane[i];
-      const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-
-      if (bs == BLOCK_INVALID)
-        uv_sad = UINT_MAX;
-      else
-        uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
-                                     pd->dst.stride);
-
-      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
-    }
-
     ref = xd->plane[0].dst.buf;
     ref_stride = xd->plane[0].dst.stride;
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 69a7cf8..983f8cc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -16,6 +16,7 @@
 
 #include "av1/common/alloccommon.h"
 #if CONFIG_CLPF
+#include "aom/aom_image.h"
 #include "av1/common/clpf.h"
 #include "av1/encoder/clpf_rdo.h"
 #endif
@@ -3423,7 +3424,7 @@
 #endif
   }
 #if CONFIG_CLPF
-  cm->clpf_strength = 0;
+  cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
   cm->clpf_size = 2;
   CHECK_MEM_ERROR(
       cm, cm->clpf_blocks,
@@ -3431,35 +3432,33 @@
                      ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
                  10));
   if (!is_lossless_requested(&cpi->oxcf)) {
-    // Test CLPF
-    int i, hq = 1;
-    // TODO(yaowu): investigate per-segment CLPF decision and
-    // an optimal threshold, use 80 for now.
-    for (i = 0; i < MAX_SEGMENTS; i++)
-      hq &= av1_get_qindex(&cm->seg, i, cm->base_qindex) < 80;
+    const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
 
-    // Don't try filter if the entire image is nearly losslessly encoded
-    if (!hq) {
-      // Find the best strength and block size for the entire frame
-      int fb_size_log2, strength;
-      av1_clpf_test_frame(&cpi->last_frame_uf, cpi->Source, cm, &strength,
-                          &fb_size_log2);
+    // Find the best strength and block size for the entire frame
+    int fb_size_log2, strength_y, strength_u, strength_v;
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
+                        AOM_PLANE_Y);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, 0, AOM_PLANE_U);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, 0, AOM_PLANE_V);
 
-      if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
-
-      if (!strength) {  // Better to disable for the whole frame?
-        cm->clpf_strength = 0;
-      } else {
-        // Apply the filter using the chosen strength
-        cm->clpf_strength = strength - (strength == 4);
-        cm->clpf_size =
-            fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
-        aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
-        cm->clpf_numblocks =
-            av1_clpf_frame(cm->frame_to_show, &cpi->last_frame_uf, cpi->Source,
-                           cm, !!cm->clpf_size, strength, 4 + cm->clpf_size,
-                           cm->clpf_blocks, av1_clpf_decision);
-      }
+    if (strength_y) {
+      // Apply the filter using the chosen strength
+      cm->clpf_strength_y = strength_y - (strength_y == 4);
+      cm->clpf_size =
+          fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
+      cm->clpf_numblocks = av1_clpf_frame(
+          frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
+          4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+    }
+    if (strength_u) {
+      cm->clpf_strength_u = strength_u - (strength_u == 4);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
+                     NULL);
+    }
+    if (strength_v) {
+      cm->clpf_strength_v = strength_v - (strength_v == 4);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
+                     NULL);
     }
   }
 #endif
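The `strength - (strength == 4)` mapping packs the filter strengths {1, 2, 4} into the 2-bit codes {1, 2, 3}, with 0 meaning "off". A hedged helper pair making the mapping and its presumed inverse explicit (names are illustrative; the decoder side is inferred, not shown in this patch):

```c
/* Strengths {0,1,2,4} <-> 2-bit codes {0,1,2,3}; the inverse is an
 * assumption about the decoder, which this patch does not show. */
static int clpf_strength_to_code(int strength) {
  return strength - (strength == 4);
}
static int clpf_code_to_strength(int code) { return code + (code == 3); }
```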
diff --git a/av1/encoder/treewriter.h b/av1/encoder/treewriter.h
index 533e7d9..9a4cb86 100644
--- a/av1/encoder/treewriter.h
+++ b/av1/encoder/treewriter.h
@@ -29,20 +29,10 @@
 
 void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *);
 
-static INLINE void av1_write_tree(aom_writer *w, const aom_tree_index *tree,
-                                  const aom_prob *probs, int bits, int len,
-                                  aom_tree_index i) {
-  do {
-    const int bit = (bits >> --len) & 1;
-    aom_write(w, bit, probs[i >> 1]);
-    i = tree[i + bit];
-  } while (len);
-}
-
 static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree,
                                    const aom_prob *probs,
                                    const struct av1_token *token) {
-  av1_write_tree(w, tree, probs, token->value, token->len, 0);
+  aom_write_tree(w, tree, probs, token->value, token->len, 0);
 }
 
 #ifdef __cplusplus
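The deleted `av1_write_tree` duplicated functionality now taken from aom_dsp; judging by the drop-in replacement in `av1_write_token` above, `aom_write_tree` presumably performs the same walk. The removed body, kept here as a sketch of that behaviour:

```c
/* Sketch of the tree walk (this is the removed local helper; the shared
 * aom_write_tree is assumed to behave equivalently). */
static INLINE void write_tree_sketch(aom_writer *w, const aom_tree_index *tree,
                                     const aom_prob *probs, int bits, int len,
                                     aom_tree_index i) {
  do {
    const int bit = (bits >> --len) & 1;  /* emit most-significant bit first */
    aom_write(w, bit, probs[i >> 1]);     /* one node probability per bit */
    i = tree[i + bit];                    /* descend to the chosen child */
  } while (len);
}
```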
diff --git a/configure b/configure
index c96691ba..1bc0863 100755
--- a/configure
+++ b/configure
@@ -606,12 +606,7 @@
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
         case ${CC} in
-          *clang*)
-              # libaom and/or clang have issues with aliasing:
-              # https://code.google.com/p/webm/issues/detail?id=603
-              # work around them until they are fixed
-              check_add_cflags -fno-strict-aliasing
-          ;;
+          *clang*) ;;
           *) check_add_cflags -Wunused-but-set-variable ;;
         esac
         if enabled mips || [ -z "${INLINE}" ]; then
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 7676226..069e35e 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <stdio.h>
diff --git a/test/clpf_test.cc b/test/clpf_test.cc
index 786180b..24d7bb3 100644
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -26,9 +26,9 @@
 
 namespace {
 
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
-                             int x0, int y0, int sizex, int sizey, int width,
-                             int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+                             int dstride, int x0, int y0, int sizex, int sizey,
+                             int width, int height, unsigned int strength);
 
 typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
     clpf_block_param_t;
@@ -54,44 +54,84 @@
 
 typedef ClpfBlockTest ClpfSpeedTest;
 
-TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
-  int w = sizex;
-  int h = sizey;
-  const int size = 32;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, s[size * size]);
-  DECLARE_ALIGNED(16, uint8_t, d[size * size]);
-  DECLARE_ALIGNED(16, uint8_t, ref_d[size * size]);
-  memset(ref_d, 0, size * size);
-  memset(d, 0, size * size);
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst,
+                                 int sstride, int dstride, int x0, int y0,
+                                 int sizex, int sizey, int width, int height,
+                                 unsigned int strength);
 
-  int error = 0;
-  int pos = 0;
-  int strength = 0;
-  int xpos = 0, ypos = 0;
-  int bits;
-  int level;
+typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
+    clpf_block_hbd_param_t;
+
+class ClpfBlockHbdTest
+    : public ::testing::TestWithParam<clpf_block_hbd_param_t> {
+ public:
+  virtual ~ClpfBlockHbdTest() {}
+  virtual void SetUp() {
+    clpf = GET_PARAM(0);
+    ref_clpf = GET_PARAM(1);
+    sizex = GET_PARAM(2);
+    sizey = GET_PARAM(3);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int sizex;
+  int sizey;
+  clpf_block_hbd_t clpf;
+  clpf_block_hbd_t ref_clpf;
+};
+
+typedef ClpfBlockHbdTest ClpfHbdSpeedTest;
+#endif
+
+template <typename pixel>
+void test_clpf(int w, int h, int depth, int iterations,
+               void (*clpf)(const pixel *src, pixel *dst, int sstride,
+                            int dstride, int x0, int y0, int sizex, int sizey,
+                            int width, int height, unsigned int strength),
+               void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
+                                int dstride, int x0, int y0, int sizex,
+                                int sizey, int width, int height,
+                                unsigned int strength)) {
+  const int size = 24;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, pixel, s[size * size]);
+  DECLARE_ALIGNED(16, pixel, d[size * size]);
+  DECLARE_ALIGNED(16, pixel, ref_d[size * size]);
+  memset(ref_d, 0, size * size * sizeof(*ref_d));
+  memset(d, 0, size * size * sizeof(*d));
+
+  int error = 0, pos = 0, strength = 0, xpos = 0, ypos = 0;
+  int bits, level, count;
 
   // Test every combination of:
-  // * Input with 1-8 bits of noise
-  // * Noise level around every value from 0 to 255
+  // * Input with up to <depth> bits of noise
+  // * Noise level around every value from 0 to (1<<depth)-1
  // * Blocks anywhere in the frame (along all edges and also fully inside)
   // * All strengths
-  for (level = 0; level < 256 && !error; level++) {
-    for (bits = 1; bits < 9 && !error; bits++) {
-      for (int i = 0; i < size * size; i++)
-        s[i] = clamp((rnd.Rand8() & ((1 << bits) - 1)) + level, 0, 255);
+  // If clpf and ref_clpf are the same, we're just testing speed
+  for (count = 0; count < iterations; count++) {
+    for (level = 0; level < (1 << depth) && !error; level++) {
+      for (bits = 1; bits <= depth && !error; bits++) {
+        for (int i = 0; i < size * size; i++)
+          s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                       (1 << depth) - 1);
 
-      for (ypos = 0; ypos < size && !error; ypos += h * !error) {
-        for (xpos = 0; xpos < size && !error; xpos += w * !error) {
-          for (strength = 0; strength < 3 && !error; strength += !error) {
-            ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
-                     1 << strength);
-            ASM_REGISTER_STATE_CHECK(
-                clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
-
-            for (pos = 0; pos < size * size && !error; pos++) {
-              error = ref_d[pos] != d[pos];
+        for (ypos = 0; ypos < size && !error; ypos += h * !error) {
+          for (xpos = 0; xpos < size && !error; xpos += w * !error) {
+            for (strength = depth - 8; strength < depth - 5 && !error;
+                 strength += !error) {
+              ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
+                       1 << strength);
+              if (clpf != ref_clpf) {
+                ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w,
+                                              h, size, size, 1 << strength));
+                for (pos = 0; pos < size * size && !error; pos++) {
+                  error = ref_d[pos] != d[pos];
+                }
+              }
             }
           }
         }
@@ -99,6 +139,7 @@
     }
   }
 
+  pos--;  // The loop increments pos past the mismatch; step back to it.
   EXPECT_EQ(0, error)
       << "Error: ClpfBlockTest, SIMD and C mismatch." << std::endl
       << "First error at " << pos % size << "," << pos / size << " ("
@@ -106,6 +147,8 @@
       << "strength: " << (1 << strength) << std::endl
       << "xpos: " << xpos << std::endl
       << "ypos: " << ypos << std::endl
+      << "w: " << w << std::endl
+      << "h: " << h << std::endl
       << "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
       << "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
       << "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
@@ -116,45 +159,26 @@
       << std::endl;
 }
 
-TEST_P(ClpfSpeedTest, TestSpeed) {
-  int w = sizex;
-  int h = sizey;
-  const int size = 32;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, s[size * size]);
-  DECLARE_ALIGNED(16, uint8_t, d[size * size]);
-
-  int strength;
-  int xpos, ypos;
-
-  for (int i = 0; i < size * size; i++) s[i] = rnd.Rand8();
-
+template <typename pixel>
+void test_clpf_speed(int w, int h, int depth, int iterations,
+                     void (*clpf)(const pixel *src, pixel *dst, int sstride,
+                                  int dstride, int x0, int y0, int sizex,
+                                  int sizey, int width, int height,
+                                  unsigned int strength),
+                     void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
+                                      int dstride, int x0, int y0, int sizex,
+                                      int sizey, int width, int height,
+                                      unsigned int strength)) {
   aom_usec_timer ref_timer;
   aom_usec_timer timer;
 
   aom_usec_timer_start(&ref_timer);
-  for (int c = 0; c < 65536; c++) {
-    for (ypos = 0; ypos < size; ypos += h) {
-      for (xpos = 0; xpos < size; xpos += w) {
-        for (strength = 0; strength < 3; strength++) {
-          ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
-        }
-      }
-    }
-  }
+  test_clpf(w, h, depth, iterations, ref_clpf, ref_clpf);
   aom_usec_timer_mark(&ref_timer);
   int ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
 
   aom_usec_timer_start(&timer);
-  for (int c = 0; c < 65536; c++) {
-    for (ypos = 0; ypos < size; ypos += h) {
-      for (xpos = 0; xpos < size; xpos += w) {
-        for (strength = 0; strength < 3; strength++) {
-          clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
-        }
-      }
-    }
-  }
+  test_clpf(w, h, depth, iterations, clpf, clpf);
   aom_usec_timer_mark(&timer);
   int elapsed_time = aom_usec_timer_elapsed(&timer);
 
@@ -165,10 +189,28 @@
 
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: ClpfSpeedTest, SIMD slower than C." << std::endl
-      << "C time: " << ref_elapsed_time << "ms" << std::endl
-      << "SIMD time: " << elapsed_time << "ms" << std::endl;
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
 }
 
+TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
+  test_clpf(sizex, sizey, 8, 1, clpf, ref_clpf);
+}
+
+TEST_P(ClpfSpeedTest, TestSpeed) {
+  test_clpf_speed(sizex, sizey, 8, 16, clpf, ref_clpf);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+TEST_P(ClpfBlockHbdTest, TestSIMDNoMismatch) {
+  test_clpf(sizex, sizey, 12, 1, clpf, ref_clpf);
+}
+
+TEST_P(ClpfHbdSpeedTest, TestSpeed) {
+  test_clpf_speed(sizex, sizey, 12, 1, clpf, ref_clpf);
+}
+#endif
+
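Note how the strength loop in `test_clpf` (from `depth - 8` up to, but excluding, `depth - 5`) keeps the tested strengths proportional to bit depth, so `TestSIMDNoMismatch` exercises {1, 2, 4} at 8 bits and {16, 32, 64} at 12 bits. A quick standalone check of that mapping (illustrative only):

```c
#include <stdio.h>

int main(void) {
  const int depths[] = { 8, 12 };
  for (int i = 0; i < 2; i++) {
    printf("depth %2d strengths:", depths[i]);
    for (int s = depths[i] - 8; s < depths[i] - 5; s++)
      printf(" %d", 1 << s);  /* 8-bit: 1 2 4; 12-bit: 16 32 64 */
    printf("\n");
  }
  return 0;
}
```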
 using std::tr1::make_tuple;
 
 // Test all supported architectures and block sizes
@@ -212,6 +254,48 @@
                                  4)));
 #endif
 
+#if CONFIG_AOM_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ClpfBlockHbdTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8),
+        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 4),
+        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 8),
+        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, ClpfBlockHbdTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8),
+        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 4),
+        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 8),
+        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, ClpfBlockHbdTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8),
+        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 4),
+        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 8),
+        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, ClpfBlockHbdTest,
+    ::testing::Values(
+        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8),
+        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 4),
+        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 8),
+        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 4)));
+#endif
+#endif
+
 // Test speed for all supported architectures
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, ClpfSpeedTest,
@@ -236,4 +320,35 @@
                         ::testing::Values(make_tuple(&aom_clpf_block_neon,
                                                      &aom_clpf_block_c, 8, 8)));
 #endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, ClpfHbdSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse2,
+                                                     &aom_clpf_block_hbd_c, 8,
+                                                     8)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, ClpfHbdSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_ssse3,
+                                                     &aom_clpf_block_hbd_c, 8,
+                                                     8)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, ClpfHbdSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse4_1,
+                                                     &aom_clpf_block_hbd_c, 8,
+                                                     8)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, ClpfHbdSpeedTest,
+                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_neon,
+                                                     &aom_clpf_block_hbd_c, 8,
+                                                     8)));
+#endif
+#endif
+
 }  // namespace
diff --git a/test/cx_set_ref.sh b/test/cx_set_ref.sh
deleted file mode 100755
index dfba40d..0000000
--- a/test/cx_set_ref.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/sh
-##
-##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-##  This file tests the libaom cx_set_ref example. To add new tests to this
-##  file, do the following:
-##    1. Write a shell function (this is your test).
-##    2. Add the function to cx_set_ref_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: $YUV_RAW_INPUT is required.
-cx_set_ref_verify_environment() {
-  if [ ! -e "${YUV_RAW_INPUT}" ]; then
-    echo "Libaom test data must exist in LIBVPX_TEST_DATA_PATH."
-    return 1
-  fi
-}
-
-# Runs cx_set_ref and updates the reference frame before encoding frame 90.
-# $1 is the codec name.
-aom_set_ref() {
-  local codec="$1"
-  local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
-
-  local output_file="${AOM_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf"
-  local ref_frame_num=90
-
-  if [ ! -x "${encoder}" ]; then
-    elog "${encoder} does not exist or is not executable."
-    return 1
-  fi
-
-  if [ "$codec" = "vp8" ]; then
-    eval "${AOM_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \
-        "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-        "${ref_frame_num}" ${devnull}
-  else
-    eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-        "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-        "${ref_frame_num}" ${devnull}
-  fi
-
-  [ -e "${output_file}" ] || return 1
-}
-
-cx_set_ref_av1() {
-  if [ "$(av1_encode_available)" = "yes" ]; then
-    aom_set_ref av1 || return 1
-  fi
-}
-
-cx_set_ref_tests="cx_set_ref_av1"
-
-run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}"
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
deleted file mode 100755
index 7c5169d..0000000
--- a/test/vpxdec.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/sh
-##
-##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-##  This file tests aomdec. To add new tests to this file, do the following:
-##    1. Write a shell function (this is your test).
-##    2. Add the function to aomdec_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: Make sure input is available.
-aomdec_verify_environment() {
-  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${AV1_WEBM_FILE}" ] || \
-    [ ! -e "${AV1_FPM_WEBM_FILE}" ] || \
-    [ ! -e "${AV1_LT_50_FRAMES_WEBM_FILE}" ] ; then
-    elog "Libaom test data must exist in LIBVPX_TEST_DATA_PATH."
-    return 1
-  fi
-  if [ -z "$(aom_tool_path aomdec)" ]; then
-    elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
-    return 1
-  fi
-}
-
-# Wrapper function for running aomdec with pipe input. Requires that
-# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
-# input file path and shifted away. All remaining parameters are passed through
-# to aomdec.
-aomdec_pipe() {
-  local readonly decoder="$(aom_tool_path aomdec)"
-  local readonly input="$1"
-  shift
-  cat "${input}" | eval "${AOM_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
-}
-
-# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
-# the directory containing aomdec. $1 one is used as the input file path and
-# shifted away. All remaining parameters are passed through to aomdec.
-aomdec() {
-  local readonly decoder="$(aom_tool_path aomdec)"
-  local readonly input="$1"
-  shift
-  eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
-}
-
-aomdec_can_decode_vp8() {
-  if [ "$(vp8_decode_available)" = "yes" ]; then
-    echo yes
-  fi
-}
-
-aomdec_can_decode_vp9() {
-  if [ "$(vp9_decode_available)" = "yes" ]; then
-    echo yes
-  fi
-}
-
-aomdec_vp8_ivf() {
-  if [ "$(aomdec_can_decode_vp8)" = "yes" ]; then
-    aomdec "${VP8_IVF_FILE}" --summary --noblit
-  fi
-}
-
-aomdec_vp8_ivf_pipe_input() {
-  if [ "$(aomdec_can_decode_vp8)" = "yes" ]; then
-    aomdec_pipe "${VP8_IVF_FILE}" --summary --noblit
-  fi
-}
-
-aomdec_vp9_webm() {
-  if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
-     [ "$(webm_io_available)" = "yes" ]; then
-    aomdec "${AV1_WEBM_FILE}" --summary --noblit
-  fi
-}
-
-aomdec_vp9_webm_frame_parallel() {
-  if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
-     [ "$(webm_io_available)" = "yes" ]; then
-    for threads in 2 3 4 5 6 7 8; do
-      aomdec "${AV1_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \
-        --frame-parallel
-    done
-  fi
-}
-
-aomdec_vp9_webm_less_than_50_frames() {
-  # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
-  # frames in actual webm_read_frame calls.
-  if [ "$(aomdec_can_decode_vp9)" = "yes" ] && \
-     [ "$(webm_io_available)" = "yes" ]; then
-    local readonly decoder="$(aom_tool_path aomdec)"
-    local readonly expected=10
-    local readonly num_frames=$(${AOM_TEST_PREFIX} "${decoder}" \
-      "${AV1_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
-      | awk '/^[0-9]+ decoded frames/ { print $1 }')
-    if [ "$num_frames" -ne "$expected" ]; then
-      elog "Output frames ($num_frames) != expected ($expected)"
-      return 1
-    fi
-  fi
-}
-
-aomdec_tests="aomdec_vp8_ivf
-              aomdec_vp8_ivf_pipe_input
-              aomdec_vp9_webm
-              aomdec_vp9_webm_frame_parallel
-              aomdec_vp9_webm_less_than_50_frames"
-
-run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index c996655..c4755f7 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -139,7 +139,7 @@
 
 class Y4mVideoWriteTest : public Y4mVideoSourceTest {
  protected:
-  Y4mVideoWriteTest() {}
+  Y4mVideoWriteTest() : tmpfile_(NULL) {}
 
   virtual ~Y4mVideoWriteTest() {
     delete tmpfile_;