Implement ERP (Extended Recursive Partitioning) tool in a single patch

STATS_CHANGED: some changes fall outside config flags and may affect coding statistics

Change-Id: Ia41604d01d6dcc95d8c010b2bca5c368ed05fdec
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9e6b5b1..d1d2b9a 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -107,6 +107,7 @@
 specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_8x16 sse2/;
 specialize qw/aom_dc_left_predictor_8x32 sse2/;
+
 specialize qw/aom_dc_left_predictor_16x4 sse2/;
 specialize qw/aom_dc_left_predictor_16x8 sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
@@ -195,9 +196,6 @@
 specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
 specialize qw/aom_paeth_predictor_32x16 ssse3/;
 specialize qw/aom_paeth_predictor_32x32 ssse3/;
 specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
@@ -322,13 +320,13 @@
 specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
 specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
 specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
 specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
+specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
 specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
 specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
-specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
 specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
 specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
 specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
@@ -666,6 +664,7 @@
   specialize qw/aom_sad16x32                msa sse2/;
   specialize qw/aom_sad16x16           neon msa sse2/;
   specialize qw/aom_sad16x8            neon msa sse2/;
+  specialize qw/aom_sad16x4                     sse2/;
   specialize qw/aom_sad8x16            neon msa sse2/;
   specialize qw/aom_sad8x8             neon msa sse2/;
   specialize qw/aom_sad8x4                  msa sse2/;
@@ -673,7 +672,6 @@
   specialize qw/aom_sad4x4             neon msa sse2/;
 
   specialize qw/aom_sad4x16                     sse2/;
-  specialize qw/aom_sad16x4                     sse2/;
   specialize qw/aom_sad8x32                     sse2/;
   specialize qw/aom_sad32x8                     sse2/;
   specialize qw/aom_sad16x64                    sse2/;
@@ -711,6 +709,7 @@
   specialize qw/aom_sad16x32_avg        msa sse2/;
   specialize qw/aom_sad16x16_avg        msa sse2/;
   specialize qw/aom_sad16x8_avg         msa sse2/;
+  specialize qw/aom_sad16x4_avg             sse2/;
   specialize qw/aom_sad8x16_avg         msa sse2/;
   specialize qw/aom_sad8x8_avg          msa sse2/;
   specialize qw/aom_sad8x4_avg          msa sse2/;
@@ -718,7 +717,6 @@
   specialize qw/aom_sad4x4_avg          msa sse2/;
 
   specialize qw/aom_sad4x16_avg             sse2/;
-  specialize qw/aom_sad16x4_avg             sse2/;
   specialize qw/aom_sad8x32_avg             sse2/;
   specialize qw/aom_sad32x8_avg             sse2/;
   specialize qw/aom_sad16x64_avg            sse2/;
@@ -894,9 +892,9 @@
   specialize qw/aom_sad32x16x4d   avx2      msa sse2/;
   specialize qw/aom_sad32x8x4d    avx2          sse2/;
   specialize qw/aom_sad16x64x4d                 sse2/;
-  specialize qw/aom_sad16x32x4d             msa sse2/;
-  specialize qw/aom_sad16x16x4d         neon msa sse2/;
-  specialize qw/aom_sad16x8x4d               msa sse2/;
+  specialize qw/aom_sad16x32x4d           msa sse2/;
+  specialize qw/aom_sad16x8x4d            msa sse2/;
+  specialize qw/aom_sad16x16x4d      neon msa sse2/;
 
   specialize qw/aom_sad8x16x4d              msa sse2/;
   specialize qw/aom_sad8x8x4d               msa sse2/;
diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c
index 39c6a40..d3ac810 100644
--- a/aom_dsp/x86/aom_convolve_copy_avx2.c
+++ b/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -27,7 +27,6 @@
 void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
     assert(!(dst_stride % 16));
   }
 
@@ -71,9 +70,9 @@
       src += src_stride;
       s[1] = _mm_loadu_si128((__m128i *)src);
       src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
+      _mm_storeu_si128((__m128i *)dst, s[0]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
+      _mm_storeu_si128((__m128i *)dst, s[1]);
       dst += dst_stride;
       h -= 2;
     } while (h);
@@ -157,7 +156,6 @@
                                    uint16_t *dst, ptrdiff_t dst_stride, int w,
                                    int h) {
   if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
     assert(!(dst_stride % 16));
   }
 
@@ -191,9 +189,9 @@
       src += src_stride;
       s[1] = _mm_loadu_si128((__m128i *)src);
       src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
+      _mm_storeu_si128((__m128i *)dst, s[0]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
+      _mm_storeu_si128((__m128i *)dst, s[1]);
       dst += dst_stride;
       h -= 2;
     } while (h);
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c
index f7b468a..9db91c3 100644
--- a/aom_dsp/x86/aom_convolve_copy_sse2.c
+++ b/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -35,7 +35,6 @@
 void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
     assert(!(dst_stride % 16));
   }
 
@@ -79,9 +78,9 @@
       src += src_stride;
       s[1] = _mm_loadu_si128((__m128i *)src);
       src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
+      _mm_storeu_si128((__m128i *)dst, s[0]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
+      _mm_storeu_si128((__m128i *)dst, s[1]);
       dst += dst_stride;
       h -= 2;
     } while (h);
@@ -200,7 +199,6 @@
                                    uint16_t *dst, ptrdiff_t dst_stride, int w,
                                    int h) {
   if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
     assert(!(dst_stride % 16));
   }
 
@@ -236,9 +234,9 @@
       src += src_stride;
       s[1] = _mm_loadu_si128((__m128i *)src);
       src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
+      _mm_storeu_si128((__m128i *)dst, s[0]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
+      _mm_storeu_si128((__m128i *)dst, s[1]);
       dst += dst_stride;
       h -= 2;
     } while (h);
@@ -251,11 +249,11 @@
       s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
       s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
       src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[1]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[2]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[3]);
       dst += dst_stride;
       h -= 2;
     } while (h);
@@ -272,15 +270,15 @@
       s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
       s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
       src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
-      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
-      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[2]);
+      _mm_storeu_si128((__m128i *)(dst + 3 * 8), s[3]);
       dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
-      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
-      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[4]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[5]);
+      _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[6]);
+      _mm_storeu_si128((__m128i *)(dst + 3 * 8), s[7]);
       dst += dst_stride;
       h -= 2;
     } while (h);
diff --git a/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
index 91b3d12..a7aa55e 100644
--- a/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
+++ b/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
@@ -65,15 +65,15 @@
   psrlw                 m0, 4
   pshuflw               m0, m0, 0x0
   punpcklqdq            m0, m0
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
+  movu   [dstq           ], m0
+  movu   [dstq+strideq*2 ], m0
+  movu   [dstq+strideq*4 ], m0
+  movu   [dstq+stride3q*2], m0
   lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
+  movu   [dstq           ], m0
+  movu   [dstq+strideq*2 ], m0
+  movu   [dstq+strideq*4 ], m0
+  movu   [dstq+stride3q*2], m0
 
   RESTORE_GOT
   RET
@@ -195,15 +195,15 @@
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3
   lea             stride3q, [strideq*3]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
+  movu   [dstq           ], m0
+  movu   [dstq+strideq*2 ], m0
+  movu   [dstq+strideq*4 ], m0
+  movu   [dstq+stride3q*2], m0
   lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
+  movu   [dstq           ], m0
+  movu   [dstq+strideq*2 ], m0
+  movu   [dstq+strideq*4 ], m0
+  movu   [dstq+stride3q*2], m0
   RET
 
 INIT_XMM sse2
diff --git a/aom_dsp/x86/highbd_intrapred_sse2.c b/aom_dsp/x86/highbd_intrapred_sse2.c
index 5a55736..e577190 100644
--- a/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -54,13 +54,13 @@
   const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   (void)above;
   (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
 }
 
 void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
@@ -77,21 +77,21 @@
   const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
   (void)above;
   (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
   dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+  _mm_storeu_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
 }
 
 void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
@@ -357,7 +357,7 @@
   const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
   int i;
   for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
+    _mm_storeu_si128((__m128i *)dst, dc_dup);
   }
 }
 
@@ -707,10 +707,10 @@
   (void)left;
   (void)bd;
   const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
-  _mm_store_si128((__m128i *)dst, above_u16);
-  _mm_store_si128((__m128i *)(dst + stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+  _mm_storeu_si128((__m128i *)dst, above_u16);
+  _mm_storeu_si128((__m128i *)(dst + stride), above_u16);
+  _mm_storeu_si128((__m128i *)(dst + 2 * stride), above_u16);
+  _mm_storeu_si128((__m128i *)(dst + 3 * stride), above_u16);
 }
 
 void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
@@ -721,10 +721,10 @@
   const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
   int i;
   for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, above_u16);
-    _mm_store_si128((__m128i *)(dst + stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+    _mm_storeu_si128((__m128i *)dst, above_u16);
+    _mm_storeu_si128((__m128i *)(dst + stride), above_u16);
+    _mm_storeu_si128((__m128i *)(dst + 2 * stride), above_u16);
+    _mm_storeu_si128((__m128i *)(dst + 3 * stride), above_u16);
     dst += stride << 2;
   }
 }
@@ -848,13 +848,13 @@
   sum32 /= 12;
   const __m128i row = _mm_set1_epi16((uint16_t)sum32);
 
-  _mm_store_si128((__m128i *)dst, row);
+  _mm_storeu_si128((__m128i *)dst, row);
   dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
+  _mm_storeu_si128((__m128i *)dst, row);
   dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
+  _mm_storeu_si128((__m128i *)dst, row);
   dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
+  _mm_storeu_si128((__m128i *)dst, row);
 }
 
 void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
@@ -873,13 +873,13 @@
   const __m128i row = _mm_set1_epi16((uint16_t)sum32);
   int i;
   for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
   }
 }
diff --git a/aom_dsp/x86/highbd_sad4d_sse2.asm b/aom_dsp/x86/highbd_sad4d_sse2.asm
index 7ae1ca1..14ea419 100644
--- a/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -98,7 +98,7 @@
 ; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
 %macro HIGH_PROCESS_8x2x4 5-6 0
   ; 1st 8 px
-  mova                  m0, [srcq +%2*2]
+  movu                  m0, [srcq +%2*2]
 %if %1 == 1
   movu                  m4, [ref1q+%3*2]
   movu                  m5, [ref2q+%3*2]
@@ -156,7 +156,7 @@
 %endif
 
   ; 2nd 8 px
-  mova                  m0, [srcq +(%4)*2]
+  movu                  m0, [srcq +(%4)*2]
   mova                  m3, m0
   movu                  m2, [ref1q+(%5)*2]
   psubusw               m3, m2
diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm
index 58f1ac9..a2510d5 100644
--- a/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/aom_dsp/x86/highbd_sad_sse2.asm
@@ -20,20 +20,21 @@
 ; Arg 2: Height
 ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
 ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
-%macro HIGH_SAD_FN 4
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
 %if %4 == 0
 %if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %elif %4 == 1 ; avg
 %if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -356,7 +357,7 @@
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
+  HIGH_SAD_FN 8, %1, 7, %2, 8
 %if %2 == 2  ; skip rows, so divide number of rows by 2
   mov              n_rowsd, %1/8
 %else
@@ -377,22 +378,30 @@
   pavgw                 m4, [second_predq+mmsize*3]
   lea         second_predq, [second_predq+mmsize*4]
 %endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
+  mova                  m7, m1
+  movu                  m5, [srcq]
+  psubusw               m1, m5
+  psubusw               m5, m7
   por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
+
+  mova                  m7, m2
+  movu                  m5, [srcq+src_strideq*2]
+  psubusw               m2, m5
+  psubusw               m5, m7
   por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
+
+  mova                  m7, m3
+  movu                  m5, [srcq+src_strideq*4]
+  psubusw               m3, m5
+  psubusw               m5, m7
   por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
+
+  mova                  m7, m4
+  movu                  m5, [srcq+src_stride3q*2]
+  psubusw               m4, m5
+  psubusw               m5, m7
   por                   m4, m5
+
   paddw                 m1, m2
   paddw                 m3, m4
   movhlps               m2, m1
diff --git a/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 5c78933..c0b39ce 100644
--- a/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -338,8 +338,8 @@
   movu                 m1, [srcq+src_strideq*2]
   movu                 m5, [srcq+src_strideq*4]
   mova                 m4, m1
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
+  movu                 m2, [dstq]
+  movu                 m3, [dstq+dst_strideq*2]
   pmullw               m1, filter_y_a
   pmullw               m5, filter_y_b
   paddw                m1, filter_rnd
@@ -404,8 +404,8 @@
   movu                 m1, [srcq + src_strideq*2]
   movu                 m4, [srcq + 2]
   movu                 m5, [srcq + src_strideq*2 + 2]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
+  movu                 m2, [dstq]
+  movu                 m3, [dstq + dst_strideq*2]
   pavgw                m0, m4
   pavgw                m1, m5
 %if %2 == 1 ; avg
@@ -476,8 +476,8 @@
   pavgw                m3, m5
   pavgw                m0, m2
   pavgw                m2, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + dst_strideq*2]
+  movu                 m4, [dstq]
+  movu                 m5, [dstq + dst_strideq*2]
 %if %2 == 1 ; avg
   pavgw                m0, [secq]
   add                secq, sec_str
@@ -591,9 +591,9 @@
   paddw                m0, filter_rnd
   psrlw                m4, 4
   paddw                m0, m2
-  mova                 m2, [dstq]
+  movu                 m2, [dstq]
   psrlw                m0, 4
-  mova                 m3, [dstq+dst_strideq*2]
+  movu                 m3, [dstq+dst_strideq*2]
 %if %2 == 1 ; avg
   pavgw                m0, [secq]
   add                secq, sec_str
@@ -682,8 +682,8 @@
   movu                 m1, [srcq+src_strideq*2]
   movu                 m2, [srcq+2]
   movu                 m3, [srcq+src_strideq*2+2]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
+  movu                 m4, [dstq]
+  movu                 m5, [dstq+dst_strideq*2]
   pmullw               m1, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m1, filter_rnd
@@ -817,8 +817,8 @@
   paddw                m3, filter_rnd
   paddw                m2, m4
   paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
+  movu                 m4, [dstq]
+  movu                 m5, [dstq+dst_strideq*2]
   psrlw                m2, 4
   psrlw                m3, 4
   pavgw                m0, m2
@@ -986,11 +986,11 @@
   pmullw               m3, filter_y_b
   paddw                m0, m2
   paddw                m4, filter_rnd
-  mova                 m2, [dstq]
+  movu                 m2, [dstq]
   paddw                m4, m3
   psrlw                m0, 4
   psrlw                m4, 4
-  mova                 m3, [dstq+dst_strideq*2]
+  movu                 m3, [dstq+dst_strideq*2]
 %if %2 == 1 ; avg
   pavgw                m0, [secq]
   add                secq, sec_str
diff --git a/aom_dsp/x86/intrapred_asm_sse2.asm b/aom_dsp/x86/intrapred_asm_sse2.asm
index 0eb6323..e9182b1 100644
--- a/aom_dsp/x86/intrapred_asm_sse2.asm
+++ b/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -226,10 +226,10 @@
   punpcklqdq            m0, m0
   packuswb              m0, m0
 .loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
+  movu    [dstq          ], m0
+  movu    [dstq+strideq  ], m0
+  movu    [dstq+strideq*2], m0
+  movu    [dstq+stride3q ], m0
   lea                 dstq, [dstq+strideq*4]
   dec              lines4d
   jnz .loop
@@ -285,10 +285,10 @@
   punpcklqdq            m0, m0
   packuswb              m0, m0
 .loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
+  movu    [dstq          ], m0
+  movu    [dstq+strideq  ], m0
+  movu    [dstq+strideq*2], m0
+  movu    [dstq+stride3q ], m0
   lea                 dstq, [dstq+strideq*4]
   dec              lines4d
   jnz .loop
@@ -486,10 +486,10 @@
   lea             stride3q, [strideq*3]
   mov              nlines4d, 4
 .loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
+  movu    [dstq          ], m0
+  movu    [dstq+strideq  ], m0
+  movu    [dstq+strideq*2], m0
+  movu    [dstq+stride3q ], m0
   lea                 dstq, [dstq+strideq*4]
   dec             nlines4d
   jnz .loop
@@ -567,12 +567,12 @@
   punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
   pshufd            m1, m0, 0x0             ; l1 repeated 16 times
   pshufd            m2, m0, 0x55            ; l2 repeated 16 times
-  mova    [dstq          ], m1
-  mova    [dstq+strideq  ], m2
+  movu    [dstq          ], m1
+  movu    [dstq+strideq  ], m2
   pshufd            m1, m0, 0xaa
   pshufd            m2, m0, 0xff
-  mova    [dstq+strideq*2], m1
-  mova    [dstq+stride3q ], m2
+  movu    [dstq+strideq*2], m1
+  movu    [dstq+stride3q ], m2
   inc                lineq
   lea                leftq, [leftq+4       ]
   lea                 dstq, [dstq+strideq*4]
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 23c5b2b..a7daefc 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -763,7 +763,7 @@
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm256_add_epi16(rep, one);
   }
@@ -787,7 +787,7 @@
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm256_add_epi16(rep, one);
   }
@@ -806,7 +806,7 @@
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm256_add_epi16(rep, one);
   }
@@ -817,7 +817,7 @@
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm256_add_epi16(rep, one);
   }
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 5afef68..39a1d78 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -36,7 +36,7 @@
                                  ptrdiff_t stride) {
   int i;
   for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
+    _mm_storeu_si128((__m128i *)dst, *row);
     dst += stride;
   }
 }
@@ -1171,7 +1171,7 @@
                                      ptrdiff_t stride) {
   int i;
   for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
+    _mm_storeu_si128((__m128i *)dst, row[i]);
     dst += stride;
   }
 }
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 5a34ea0..1094fdc 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -212,7 +212,7 @@
     const __m128i l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm_add_epi16(rep, one);
   }
@@ -234,7 +234,7 @@
     const __m128i l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm_add_epi16(rep, one);
   }
@@ -257,7 +257,7 @@
     const __m128i l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm_add_epi16(rep, one);
   }
@@ -281,7 +281,7 @@
     l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm_add_epi16(rep, one);
   }
@@ -292,7 +292,7 @@
     l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
 
-    _mm_store_si128((__m128i *)dst, row);
+    _mm_storeu_si128((__m128i *)dst, row);
     dst += stride;
     rep = _mm_add_epi16(rep, one);
   }
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index b5a17c5..82b5125 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -210,7 +210,7 @@
 %macro PROCESS_16x2x4 9
   ; 1st 16 px
   HANDLE_FIRST_OFFSET   %7, %2
-  mova                  m0, [srcq + first_offset]
+  movu                  m0, [srcq + first_offset]
   HANDLE_SECOND_OFFSET  %7, %8, %4
 %if %1 == 1
   movu                  m4, [ref1q+%3]
@@ -256,7 +256,7 @@
 %endif ; %1 == 1
 
   ; 2nd 16 px
-  mova                  m0, [srcq + second_offset]
+  movu                  m0, [srcq + second_offset]
   movu                  m1, [ref1q+%5]
   movu                  m2, [ref2q+%5]
 
diff --git a/aom_dsp/x86/sad_sse2.asm b/aom_dsp/x86/sad_sse2.asm
index de9845a..86cf70a 100644
--- a/aom_dsp/x86/sad_sse2.asm
+++ b/aom_dsp/x86/sad_sse2.asm
@@ -270,25 +270,33 @@
   pxor                  m0, m0
 
 .loop:
+; Handle the first two rows
   movu                  m1, [refq]
   movu                  m2, [refq+ref_strideq]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_stride3q]
+  movu                  m3, [srcq]
+  movu                  m4, [srcq+src_strideq]
 %if %2 == 1
   pavgb                 m1, [second_predq+mmsize*0]
   pavgb                 m2, [second_predq+mmsize*1]
+%endif
+  psadbw                m1, m3
+  psadbw                m2, m4
+; Then the next two rows
+  movu                  m3, [refq+ref_strideq*2]
+  paddd                 m2, m1
+  movu                  m4, [refq+ref_stride3q]
+  paddd                 m0, m2
+  movu                  m1, [srcq+src_strideq*2]
+  movu                  m2, [srcq+src_stride3q]
+%if %2 == 1
   pavgb                 m3, [second_predq+mmsize*2]
   pavgb                 m4, [second_predq+mmsize*3]
   lea         second_predq, [second_predq+mmsize*4]
 %endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+src_strideq]
-  psadbw                m3, [srcq+src_strideq*2]
-  psadbw                m4, [srcq+src_stride3q]
-  paddd                 m1, m2
+  psadbw                m3, m1
+  psadbw                m4, m2
   paddd                 m3, m4
   lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
   lea                 srcq, [srcq+src_strideq*4]
   paddd                 m0, m3
   dec              n_rowsd
diff --git a/aom_dsp/x86/subpel_variance_sse2.asm b/aom_dsp/x86/subpel_variance_sse2.asm
index cbf2890..ce3592e 100644
--- a/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/aom_dsp/x86/subpel_variance_sse2.asm
@@ -210,7 +210,7 @@
 .x_zero_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
 %if %2 == 1 ; avg
   pavgb                m0, [secq]
   punpckhbw            m3, m1, m5
@@ -286,7 +286,7 @@
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
@@ -389,7 +389,7 @@
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
@@ -507,7 +507,7 @@
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
@@ -586,7 +586,7 @@
 .x_half_y_half_loop:
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   pavgb                m4, m3
   punpckhbw            m3, m1, m5
   pavgb                m0, m4
@@ -716,7 +716,7 @@
 .x_half_y_other_loop:
   movu                 m4, [srcq]
   movu                 m2, [srcq+1]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   pavgb                m4, m2
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
@@ -870,7 +870,7 @@
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
@@ -1040,7 +1040,7 @@
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
 %if cpuflag(ssse3)
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
@@ -1066,7 +1066,7 @@
   paddw                m2, filter_rnd
   paddw                m4, m3
   paddw                m2, m1
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   psraw                m4, 4
   psraw                m2, 4
   punpckhbw            m3, m1, m5
@@ -1257,7 +1257,7 @@
 %if cpuflag(ssse3)
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
@@ -1303,7 +1303,7 @@
   pmullw               m0, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m2, m1
-  mova                 m1, [dstq]
+  movu                 m1, [dstq]
   paddw                m0, filter_rnd
   psraw                m2, 4
   paddw                m0, m3
diff --git a/aom_dsp/x86/subtract_sse2.asm b/aom_dsp/x86/subtract_sse2.asm
index af38022..a3fd3d5 100644
--- a/aom_dsp/x86/subtract_sse2.asm
+++ b/aom_dsp/x86/subtract_sse2.asm
@@ -38,10 +38,10 @@
   je .case_64
 
 %macro loop16 6
-  mova                  m0, [srcq+%1]
-  mova                  m4, [srcq+%2]
-  mova                  m1, [predq+%3]
-  mova                  m5, [predq+%4]
+  movu                  m0, [srcq+%1]
+  movu                  m4, [srcq+%2]
+  movu                  m1, [predq+%3]
+  movu                  m5, [predq+%4]
   punpckhbw             m2, m0, m7
   punpckhbw             m3, m1, m7
   punpcklbw             m0, m7
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 88ba906..580ebe0 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -581,7 +581,11 @@
   config->enable_rect_partitions = 1;
   config->enable_1to4_partitions = 1;
   config->disable_ml_transform_speed_features = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  config->disable_ml_partition_speed_features = 1;
+#else
   config->disable_ml_partition_speed_features = 0;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 #if CONFIG_SDP
   config->enable_sdp = 1;
 #endif
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 6400e19..dabd2e5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -386,11 +386,15 @@
   0,                            // film_grain_table_filename
   0,                            // motion_vector_unit_test
   1,                            // CDF update mode
-  0,                            // disable ML based partition speed up features
-  1,                            // enable rectangular partitions
-  1,                            // enable ab shape partitions
-  1,                            // enable 1:4 and 4:1 partitions
-  0,                            // disable ml based transform speed features
+#if CONFIG_EXT_RECUR_PARTITIONS
+  1,  // disable ML based partition speed up features
+#else
+  0,  // disable ML based partition speed up features
+#endif
+  1,  // enable rectangular partitions
+  1,  // enable ab shape partitions
+  1,  // enable 1:4 and 4:1 partitions
+  0,  // disable ml based transform speed features
 #if CONFIG_SDP
   1,    // enable semi-decoupled partitioning
 #endif  // CONFIG_SDP
@@ -1364,6 +1368,10 @@
 #if CONFIG_SDP
   part_cfg->enable_sdp = extra_cfg->enable_sdp;
 #endif
+#if CONFIG_EXT_RECUR_PARTITIONS
+  part_cfg->disable_ml_partition_speed_features =
+      extra_cfg->disable_ml_partition_speed_features;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   part_cfg->min_partition_size = extra_cfg->min_partition_size;
   part_cfg->max_partition_size = extra_cfg->max_partition_size;
 
@@ -3976,7 +3984,13 @@
     0,                           // use_fixed_qp_offsets
     { -1, -1, -1, -1, -1, -1 },  // fixed_qp_offsets
     {
-        0, 128, 128, 4, 1, 1, 1, 0, 0,
+        0, 128, 128, 4, 1, 1, 1,
+#if CONFIG_EXT_RECUR_PARTITIONS
+        1,
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+        0,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+        0,
 #if CONFIG_SDP
         1,
 #endif  // CONFIG_SDP
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index d11f8f7..f588e7d 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -174,8 +174,24 @@
   above_contexts->num_planes = 0;
 }
 
+static void free_sbi(CommonSBInfoParams *sbi_params) {
+  for (int i = 0; i < sbi_params->sbi_alloc_size; ++i) {
+#if CONFIG_SDP
+    av1_free_ptree_recursive(sbi_params->sbi_grid_base[i].ptree_root[0]);
+    av1_free_ptree_recursive(sbi_params->sbi_grid_base[i].ptree_root[1]);
+#else
+    av1_free_ptree_recursive(sbi_params->sbi_grid_base[i].ptree_root);
+#endif  // CONFIG_SDP
+  }
+
+  aom_free(sbi_params->sbi_grid_base);
+  sbi_params->sbi_grid_base = NULL;
+  sbi_params->sbi_alloc_size = 0;
+}
+
 void av1_free_context_buffers(AV1_COMMON *cm) {
   cm->mi_params.free_mi(&cm->mi_params);
+  free_sbi(&cm->sbi_params);
 
   av1_free_above_context_buffers(&cm->above_contexts);
 
@@ -274,10 +290,49 @@
   return 0;
 }
 
+static void set_sb_si(AV1_COMMON *cm) {
+  CommonSBInfoParams *const sbi_params = &cm->sbi_params;
+  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  sbi_params->sb_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, mib_size_log2) >> mib_size_log2;
+  sbi_params->sb_rows =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, mib_size_log2) >> mib_size_log2;
+  sbi_params->sbi_stride = cm->mi_params.mi_stride >> mib_size_log2;
+}
+
+static int alloc_sbi(CommonSBInfoParams *sbi_params) {
+  const int sbi_size =
+      sbi_params->sbi_stride * calc_mi_size(sbi_params->sb_rows);
+
+  if (sbi_params->sbi_alloc_size < sbi_size) {
+    free_sbi(sbi_params);
+    sbi_params->sbi_grid_base = aom_calloc(sbi_size, sizeof(SB_INFO));
+
+    if (!sbi_params->sbi_grid_base) return 1;
+
+    sbi_params->sbi_alloc_size = sbi_size;
+    for (int i = 0; i < sbi_size; ++i) {
+#if CONFIG_SDP
+      sbi_params->sbi_grid_base[i].ptree_root[0] = NULL;
+      sbi_params->sbi_grid_base[i].ptree_root[1] = NULL;
+#else
+      sbi_params->sbi_grid_base[i].ptree_root = NULL;
+#endif
+    }
+  }
+
+  return 0;
+}
+
 int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
   CommonModeInfoParams *const mi_params = &cm->mi_params;
   mi_params->set_mb_mi(mi_params, width, height);
   if (alloc_mi(mi_params)) goto fail;
+
+  CommonSBInfoParams *const sbi_params = &cm->sbi_params;
+  set_sb_si(cm);
+  if (alloc_sbi(sbi_params)) goto fail;
+
   return 0;
 
 fail:
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 0781160..96364ed 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -641,6 +641,33 @@
   /**@}*/
 };
 
+typedef struct CommonSBInfoParams CommonSBInfoParams;
+/*!
+ * \brief Params related to SB_INFO arrays and related info.
+ */
+struct CommonSBInfoParams {
+  /*!
+   * Grid of pointers to SB_INFO structs.
+   */
+  SB_INFO *sbi_grid_base;
+  /*!
+   * Stride for 'sbi_grid_base'.
+   */
+  int sbi_stride;
+  /*!
+   * Number of superblocks in the vertical direction.
+   */
+  int sb_rows;
+  /*!
+   * Number of superblocks in the horizontal direction.
+   */
+  int sb_cols;
+  /*!
+   * Number of SB_INFO structs that are currently allocated.
+   */
+  int sbi_alloc_size;
+};
+
 typedef struct CommonQuantParams CommonQuantParams;
 /*!
  * \brief Parameters related to quantization at the frame level.
@@ -962,6 +989,11 @@
    */
   CommonModeInfoParams mi_params;
 
+  /*!
+   * Params related to SB_INFO arrays and related info.
+   */
+  CommonSBInfoParams sbi_params;
+
 #if CONFIG_ENTROPY_STATS
   /*!
    * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1).
@@ -1382,28 +1414,23 @@
 }
 
 static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       const int num_planes) {
-  int i;
-  int row_offset = mi_row;
-  int col_offset = mi_col;
+                                       const int num_planes,
+                                       const CHROMA_REF_INFO *chr_ref_info) {
 #if CONFIG_SDP
-  for (i = (xd->tree_type == CHROMA_PART); i < num_planes; ++i) {
+  for (int i = (xd->tree_type == CHROMA_PART); i < num_planes; ++i) {
 #else
-  for (i = 0; i < num_planes; ++i) {
+  for (int i = 0; i < num_planes; ++i) {
 #endif
     struct macroblockd_plane *const pd = &xd->plane[i];
     // Offset the buffer pointer
-#if CONFIG_SDP
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type[xd->tree_type == CHROMA_PART];
-#else
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-#endif
-    if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
-      row_offset = mi_row - 1;
-    if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
-      col_offset = mi_col - 1;
-    int above_idx = col_offset;
-    int left_idx = row_offset & MAX_MIB_MASK;
+    const int row_offset =
+        i && chr_ref_info ? chr_ref_info->mi_row_chroma_base : mi_row;
+    const int col_offset =
+        i && chr_ref_info ? chr_ref_info->mi_col_chroma_base : mi_col;
+    assert(row_offset >= 0);
+    assert(col_offset >= 0);
+    const int above_idx = col_offset;
+    const int left_idx = row_offset & MAX_MIB_MASK;
     pd->above_entropy_context =
         &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
     pd->left_entropy_context =
@@ -1417,15 +1444,26 @@
 }
 
 static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
-                                const int num_planes) {
+                                const int num_planes,
+                                const CHROMA_REF_INFO *chr_ref_info) {
   int i;
 #if CONFIG_SDP
   for (i = (xd->tree_type == CHROMA_PART); i < num_planes; i++) {
 #else
   for (i = 0; i < num_planes; i++) {
 #endif
-    xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
-    xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
+    if (chr_ref_info && i > 0) {
+      const BLOCK_SIZE plane_bsize = chr_ref_info->bsize_base;
+      assert(plane_bsize < BLOCK_SIZES_ALL);
+
+      xd->plane[i].width =
+          block_size_wide[plane_bsize] >> xd->plane[i].subsampling_x;
+      xd->plane[i].height =
+          block_size_high[plane_bsize] >> xd->plane[i].subsampling_y;
+    } else {
+      xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
+      xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
+    }
 
     xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
     xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
@@ -1434,7 +1472,8 @@
 
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh, int mi_col, int bw,
-                                  int mi_rows, int mi_cols) {
+                                  int mi_rows, int mi_cols,
+                                  const CHROMA_REF_INFO *chr_ref_info) {
   xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
   xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
   xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
@@ -1445,17 +1484,9 @@
 
   // Are edges available for intra prediction?
   xd->up_available = (mi_row > tile->mi_row_start);
-
-  const int ss_x = xd->plane[1].subsampling_x;
-  const int ss_y = xd->plane[1].subsampling_y;
-
   xd->left_available = (mi_col > tile->mi_col_start);
   xd->chroma_up_available = xd->up_available;
   xd->chroma_left_available = xd->left_available;
-  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
-    xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
-  if (ss_y && bh < mi_size_high[BLOCK_8X8])
-    xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
   if (xd->up_available) {
     xd->above_mbmi = xd->mi[-xd->mi_stride];
   } else {
@@ -1468,28 +1499,38 @@
     xd->left_mbmi = NULL;
   }
 
-  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
-                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
-  xd->is_chroma_ref = chroma_ref;
-  if (chroma_ref) {
-    // To help calculate the "above" and "left" chroma blocks, note that the
-    // current block may cover multiple luma blocks (eg, if partitioned into
-    // 4x4 luma blocks).
-    // First, find the top-left-most luma block covered by this chroma block
-    MB_MODE_INFO **base_mi =
-        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+  if (chr_ref_info) {
+    xd->is_chroma_ref = chr_ref_info->is_chroma_ref;
+    xd->chroma_left_available =
+        chr_ref_info->mi_col_chroma_base > tile->mi_col_start;
+    xd->chroma_up_available =
+        chr_ref_info->mi_row_chroma_base > tile->mi_row_start;
+    if (xd->is_chroma_ref) {
+      // To help calculate the "above" and "left" chroma blocks, note that the
+      // current block may cover multiple luma blocks (eg, if partitioned into
+      // 4x4 luma blocks).
+      // First, find the top-left-most luma block covered by this chroma block
+      const int ss_x = xd->plane[1].subsampling_x;
+      const int ss_y = xd->plane[1].subsampling_y;
+      const int mi_row_offset = mi_row - chr_ref_info->mi_row_chroma_base;
+      const int mi_col_offset = mi_col - chr_ref_info->mi_col_chroma_base;
+      MB_MODE_INFO **base_mi =
+          &xd->mi[-mi_row_offset * xd->mi_stride - mi_col_offset];
 
-    // Then, we consider the luma region covered by the left or above 4x4 chroma
-    // prediction. We want to point to the chroma reference block in that
-    // region, which is the bottom-right-most mi unit.
-    // This leads to the following offsets:
-    MB_MODE_INFO *chroma_above_mi =
-        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
-    xd->chroma_above_mbmi = chroma_above_mi;
+      // Then, we consider the luma region covered by the left or above 4x4
+      // chroma prediction. We want to point to the chroma reference block in
+      // that region, which is the bottom-right-most mi unit. This leads to the
+      // following offsets:
+      MB_MODE_INFO *chroma_above_mi =
+          xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+      xd->chroma_above_mbmi = chroma_above_mi;
 
-    MB_MODE_INFO *chroma_left_mi =
-        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
-    xd->chroma_left_mbmi = chroma_left_mi;
+      MB_MODE_INFO *chroma_left_mi =
+          xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+      xd->chroma_left_mbmi = chroma_left_mi;
+    }
+  } else {
+    xd->is_chroma_ref = 1;
   }
 
   xd->height = bh;
@@ -1497,9 +1538,16 @@
 
   xd->is_last_vertical_rect = 0;
   if (xd->width < xd->height) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    // For PARTITION_VERT_3, it would be (0, 1, 1), because 2nd subpartition has
+    // ratio 1:2, so not enough top-right pixels are available.
+    // For other partitions, it would be (0, 1).
+    if (mi_col & (xd->height - 1)) xd->is_last_vertical_rect = 1;
+#else
     if (!((mi_col + xd->width) & (xd->height - 1))) {
       xd->is_last_vertical_rect = 1;
     }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   }
 
   xd->is_first_horizontal_rect = 0;
@@ -1531,6 +1579,7 @@
   PARTITION_CONTEXT *const left_ctx =
       xd->left_partition_context + (mi_row & MAX_MIB_MASK);
 #endif
+  assert(bsize < BLOCK_SIZES_ALL);
 
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
@@ -1561,10 +1610,14 @@
   out[0] = CDF_PROB_TOP;
   out[0] -= cdf_element_prob(in, PARTITION_HORZ);
   out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_3);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
   if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   out[0] = AOM_ICDF(out[0]);
   out[1] = AOM_ICDF(CDF_PROB_TOP);
 }
@@ -1576,10 +1629,14 @@
   out[0] = CDF_PROB_TOP;
   out[0] -= cdf_element_prob(in, PARTITION_VERT);
   out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_3);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
   if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   out[0] = AOM_ICDF(out[0]);
   out[1] = AOM_ICDF(CDF_PROB_TOP);
 }
@@ -1588,9 +1645,13 @@
                                                 int mi_col, BLOCK_SIZE subsize,
                                                 BLOCK_SIZE bsize,
                                                 PARTITION_TYPE partition) {
-  if (bsize >= BLOCK_8X8) {
+  if (is_partition_point(bsize)) {
     const int hbs = mi_size_wide[bsize] / 2;
-    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    const int quarter_step = hbs / 2;
+#else
+    const BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     switch (partition) {
       case PARTITION_SPLIT:
         if (bsize != BLOCK_8X8) break;
@@ -1598,10 +1659,28 @@
       case PARTITION_NONE:
       case PARTITION_HORZ:
       case PARTITION_VERT:
-      case PARTITION_HORZ_4:
-      case PARTITION_VERT_4:
         update_partition_context(xd, mi_row, mi_col, subsize, bsize);
         break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      case PARTITION_HORZ_3: {
+        const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_HORZ);
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row + quarter_step, mi_col, bsize3,
+                                 bsize3);
+        update_partition_context(xd, mi_row + 3 * quarter_step, mi_col, subsize,
+                                 subsize);
+        break;
+      }
+      case PARTITION_VERT_3: {
+        const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_VERT);
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row, mi_col + quarter_step, bsize3,
+                                 bsize3);
+        update_partition_context(xd, mi_row, mi_col + 3 * quarter_step, subsize,
+                                 subsize);
+        break;
+      }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       case PARTITION_HORZ_A:
         update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
         update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
@@ -1618,6 +1697,11 @@
         update_partition_context(xd, mi_row, mi_col, subsize, subsize);
         update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
         break;
+      case PARTITION_HORZ_4:
+      case PARTITION_VERT_4:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       default: assert(0 && "Invalid partition type");
     }
   }
@@ -1636,27 +1720,68 @@
   const PARTITION_CONTEXT *left_ctx =
       xd->left_partition_context + (mi_row & MAX_MIB_MASK);
 #endif
-  // Minimum partition point is 8x8. Offset the bsl accordingly.
-  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
-  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (is_square_block(bsize)) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    // Minimum partition point is 8x8. Offset the bsl accordingly.
+    const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+    int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 
-  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
-  assert(bsl >= 0);
+    assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
+    assert(bsl >= 0);
 
-  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+    return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  } else {
+    const int bsl_w = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+    const int bsl_h = mi_size_high_log2[bsize] - mi_size_high_log2[BLOCK_8X8];
+
+    const int above = (*above_ctx >> AOMMAX(bsl_w, 0)) & 1;
+    const int left = (*left_ctx >> AOMMAX(bsl_h, 0)) & 1;
+
+    return (left * 2 + above) +
+           AOMMIN(bsl_w + 1, bsl_h + 1) * PARTITION_PLOFFSET;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 // Return the number of elements in the partition CDF when
 // partitioning the (square) block with luma block size of bsize.
 static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (bsize <= BLOCK_8X8 || bsize == BLOCK_128X128) return PARTITION_TYPES;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   if (bsize <= BLOCK_8X8)
     return PARTITION_TYPES;
   else if (bsize == BLOCK_128X128)
     return EXT_PARTITION_TYPES - 2;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   else
     return EXT_PARTITION_TYPES;
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static INLINE int partition_rec_cdf_length(BLOCK_SIZE bsize) {
+  assert(block_size_wide[bsize] != block_size_high[bsize]);
+
+  switch (bsize) {
+    case BLOCK_4X8:
+    case BLOCK_8X4: return (PARTITION_LONG_SIDE_2_REC + 1);
+    case BLOCK_64X128:
+    case BLOCK_128X64: return (PARTITION_LONG_SIDE_3_REC + 1);
+    case BLOCK_8X16:
+    case BLOCK_16X8:
+    case BLOCK_16X32:
+    case BLOCK_32X16:
+    case BLOCK_32X64:
+    case BLOCK_64X32: return PARTITION_TYPES_REC;
+    default:
+      assert(0 && "Invalid splittable rectangular bsize");
+      return PARTITION_INVALID_REC;
+  }
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                  int plane) {
   assert(bsize < BLOCK_SIZES_ALL);
@@ -2130,30 +2255,52 @@
       // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
       // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
       // half was split.
-      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+      if (sshigh * 4 == bhigh) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+        return PARTITION_HORZ_3;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+        return PARTITION_HORZ_4;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      }
+#if !CONFIG_EXT_RECUR_PARTITIONS
       assert(sshigh * 2 == bhigh);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 #if CONFIG_SDP
       if (mbmi_below->sb_type[plane_type] == subsize)
 #else
       if (mbmi_below->sb_type == subsize)
-#endif
+#endif  // CONFIG_SDP
         return PARTITION_HORZ;
+
+#if !CONFIG_EXT_RECUR_PARTITIONS
       else
         return PARTITION_HORZ_B;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     } else if (sshigh == bhigh) {
       // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
       // PARTITION_VERT_B. To distinguish the latter two, check if the right
       // half was split.
-      if (sswide * 4 == bwide) return PARTITION_VERT_4;
+      if (sswide * 4 == bwide) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+        return PARTITION_VERT_3;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+        return PARTITION_VERT_4;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      }
+#if !CONFIG_EXT_RECUR_PARTITIONS
       assert(sswide * 2 == bhigh);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 #if CONFIG_SDP
       if (mbmi_right->sb_type[plane_type] == subsize)
 #else
       if (mbmi_right->sb_type == subsize)
 #endif
         return PARTITION_VERT;
+
+#if !CONFIG_EXT_RECUR_PARTITIONS
       else
         return PARTITION_VERT_B;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     } else {
       // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
       // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
@@ -2162,6 +2309,18 @@
       // PARTITION_VERT_A, the right block will have height bhigh; with
       // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise
       // it's PARTITION_SPLIT.
+#if CONFIG_EXT_RECUR_PARTITIONS
+      if (sswide * 2 != bwide || sshigh * 2 != bhigh) {
+#if CONFIG_SDP
+        if (mi_size_wide[mbmi_below->sb_type[plane_type]] < bwide &&
+            mi_size_high[mbmi_right->sb_type[plane_type]] < bhigh)
+#else
+        if (mi_size_wide[mbmi_below->sb_type] < bwide &&
+            mi_size_high[mbmi_right->sb_type] < bhigh)
+#endif  // CONFIG_SDP
+          return PARTITION_SPLIT;
+      }
+#else  // CONFIG_EXT_RECUR_PARTITIONS
       if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
 #if CONFIG_SDP
       if (mi_size_wide[mbmi_below->sb_type[plane_type]] == bwide)
@@ -2172,6 +2331,7 @@
       if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
       if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
 #endif
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       return PARTITION_SPLIT;
     }
   }
@@ -2194,6 +2354,22 @@
   seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
 }
 
+static INLINE SB_INFO *av1_get_sb_info(const AV1_COMMON *cm, int mi_row,
+                                       int mi_col) {
+  const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
+  const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
+  return cm->sbi_params.sbi_grid_base + sb_row * cm->sbi_params.sbi_stride +
+         sb_col;
+}
+
+static INLINE void av1_set_sb_info(AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row,
+                                   int mi_col) {
+  xd->sbi = av1_get_sb_info(cm, mi_row, mi_col);
+
+  xd->sbi->mi_row = mi_row;
+  xd->sbi->mi_col = mi_col;
+}
+
 // Returns true if the frame is fully lossless at the coded resolution.
 // Note: If super-resolution is used, such a frame will still NOT be lossless at
 // the upscaled resolution.
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index 6bbb854..5be219d 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -196,29 +196,36 @@
 }
 
 static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
-#if CONFIG_SDP
-                                  const AV1_COMMON *const cm,
-#endif
                                   const MB_MODE_INFO *const mbmi,
                                   const EDGE_DIR edge_dir, const int mi_row,
                                   const int mi_col, const int plane,
+#if CONFIG_SDP
+                                  const TREE_TYPE tree_type,
+#endif
                                   const struct macroblockd_plane *plane_ptr) {
   assert(mbmi != NULL);
   if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
 #if CONFIG_SDP
-  const int plane_type =
-      (frame_is_intra_only(cm) && plane > 0 && cm->seq_params.enable_sdp);
+  const int plane_type = av1_get_sdp_idx(tree_type);
 #endif
-  TX_SIZE tx_size = (plane == AOM_PLANE_Y)
-                        ? mbmi->tx_size
-#if CONFIG_SDP
-                        : av1_get_max_uv_txsize(mbmi->sb_type[plane_type],
-                                                plane_ptr->subsampling_x,
-                                                plane_ptr->subsampling_y);
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+  const BLOCK_SIZE bsize_base =
+      get_bsize_base_from_tree_type(mbmi, tree_type, plane);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+
+  TX_SIZE tx_size =
+      (plane == AOM_PLANE_Y)
+          ? mbmi->tx_size
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+          : av1_get_max_uv_txsize(bsize_base, plane_ptr->subsampling_x,
+                                  plane_ptr->subsampling_y);
+#elif CONFIG_SDP
+          : av1_get_max_uv_txsize(mbmi->sb_type[plane_type],
+                                  plane_ptr->subsampling_x,
+                                  plane_ptr->subsampling_y);
 #else
-                        : av1_get_max_uv_txsize(mbmi->sb_type,
-                                                plane_ptr->subsampling_x,
-                                                plane_ptr->subsampling_y);
+          : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
+                                  plane_ptr->subsampling_y);
 #endif
   assert(tx_size < TX_SIZES_ALL);
 #if CONFIG_SDP
@@ -266,9 +273,15 @@
   params->filter_length = 0;
 
 #if CONFIG_SDP
-  const int plane_type =
-      (frame_is_intra_only(cm) && plane > 0 && cm->seq_params.enable_sdp);
-#endif
+  TREE_TYPE tree_type = SHARED_PART;
+  const bool is_sdp_eligible = frame_is_intra_only(cm) &&
+                               !cm->seq_params.monochrome &&
+                               cm->seq_params.enable_sdp;
+  if (is_sdp_eligible) {
+    tree_type = (plane == AOM_PLANE_Y) ? LUMA_PART : CHROMA_PART;
+  }
+  const int plane_type = is_sdp_eligible && plane > 0;
+#endif  // CONFIG_SDP
 
   // no deblocking is required
   const uint32_t width = plane_ptr->dst.width;
@@ -296,7 +309,7 @@
 
   const TX_SIZE ts =
 #if CONFIG_SDP
-      get_transform_size(xd, cm, mi[0], edge_dir, mi_row, mi_col, plane,
+      get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, tree_type,
                          plane_ptr);
 #else
       get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr);
@@ -316,7 +329,7 @@
           av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
 #if CONFIG_SDP
       const int curr_skipped =
-          mbmi->skip_txfm[plane_type] && is_inter_block(mbmi, xd->tree_type);
+          mbmi->skip_txfm[plane_type] && is_inter_block(mbmi, tree_type);
 #else
       const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
 #endif
@@ -331,7 +344,8 @@
               (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
           const TX_SIZE pv_ts = get_transform_size(
 #if CONFIG_SDP
-              xd, cm, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
+              xd, mi_prev, edge_dir, pv_row, pv_col, plane, tree_type,
+              plane_ptr);
 #else
               xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
 #endif
@@ -342,15 +356,24 @@
           const int pv_skip_txfm =
 #if CONFIG_SDP
               mi_prev->skip_txfm[plane_type] &&
-              is_inter_block(mi_prev, xd->tree_type);
+              is_inter_block(mi_prev, tree_type);
 #else
               mi_prev->skip_txfm && is_inter_block(mi_prev);
 #endif
 #if CONFIG_SDP
-          const BLOCK_SIZE bsize = get_plane_block_size(
-              mbmi->sb_type[plane > 0], plane_ptr->subsampling_x,
+          const BLOCK_SIZE bsize = get_mb_plane_block_size_from_tree_type(
+              mbmi, tree_type, plane, plane_ptr->subsampling_x,
               plane_ptr->subsampling_y);
-#else
+#if !CONFIG_EXT_RECUR_PARTITIONS
+          assert(bsize == get_plane_block_size(mbmi->sb_type[plane_type],
+                                               plane_ptr->subsampling_x,
+                                               plane_ptr->subsampling_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#elif CONFIG_EXT_RECUR_PARTITIONS
+          const BLOCK_SIZE bsize =
+              get_mb_plane_block_size(xd, mbmi, plane, plane_ptr->subsampling_x,
+                                      plane_ptr->subsampling_y);
+#else  // !CONFIG_EXT_RECUR_PARTITIONS && !CONFIG_SDP
           const BLOCK_SIZE bsize =
               get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
                                    plane_ptr->subsampling_y);
@@ -681,8 +704,7 @@
       else if (plane == 2 && !(cm->lf.filter_level_v))
         continue;
 
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
-                           plane, plane + 1);
+      av1_setup_dst_planes(pd, frame_buffer, 0, 0, plane, plane + 1, NULL);
 
       av1_build_bitmask_vert_info(cm, &pd[plane], plane);
       av1_build_bitmask_horz_info(cm, &pd[plane], plane);
@@ -690,19 +712,20 @@
       // apply loop filtering which only goes through buffer once
       for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
         for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
-          av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col,
-                               plane, plane + 1);
+          av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
                                               mi_col);
           if (mi_col - MI_SIZE_64X64 >= 0) {
-            av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                                 mi_col - MI_SIZE_64X64, plane, plane + 1);
+            av1_setup_dst_planes(pd, frame_buffer, mi_row,
+                                 mi_col - MI_SIZE_64X64, plane, plane + 1,
+                                 NULL);
             av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
                                                 mi_col - MI_SIZE_64X64);
           }
         }
-        av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                             mi_col - MI_SIZE_64X64, plane, plane + 1);
+        av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col - MI_SIZE_64X64,
+                             plane, plane + 1, NULL);
         av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
                                             mi_col - MI_SIZE_64X64);
       }
@@ -724,22 +747,21 @@
       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
         for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
           // filter vertical edges
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+          av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
           // filter horizontal edges
           if (mi_col - MAX_MIB_SIZE >= 0) {
-            av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
-                                 mi_row, mi_col - MAX_MIB_SIZE, plane,
-                                 plane + 1);
+            av1_setup_dst_planes(pd, frame_buffer, mi_row,
+                                 mi_col - MAX_MIB_SIZE, plane, plane + 1, NULL);
             av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                         mi_col - MAX_MIB_SIZE);
           }
         }
         // filter horizontal edges
-        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                             mi_col - MAX_MIB_SIZE, plane, plane + 1);
+        av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col - MAX_MIB_SIZE,
+                             plane, plane + 1, NULL);
         av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                     mi_col - MAX_MIB_SIZE);
       }
@@ -747,8 +769,8 @@
       // filter all vertical edges in every 128x128 super block
       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
         for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+          av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
         }
@@ -757,8 +779,8 @@
       // filter all horizontal edges in every 128x128 super block
       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
         for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+          av1_setup_dst_planes(pd, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
         }
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index e8612f2..01e745b 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -15,6 +15,7 @@
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 
 PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
   if (!left_mi) return DC_PRED;
@@ -38,6 +39,113 @@
   return above_mi->mode;
 }
 
+void av1_reset_is_mi_coded_map(MACROBLOCKD *xd, int stride) {
+  av1_zero(xd->is_mi_coded);
+  xd->is_mi_coded_stride = stride;
+}
+
+void av1_mark_block_as_coded(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                             BLOCK_SIZE sb_size) {
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int sb_mi_size = mi_size_wide[sb_size];
+  const int mi_row_offset = mi_row & (sb_mi_size - 1);
+  const int mi_col_offset = mi_col & (sb_mi_size - 1);
+
+  for (int r = 0; r < mi_size_high[bsize]; ++r)
+    for (int c = 0; c < mi_size_wide[bsize]; ++c) {
+      const int pos =
+          (mi_row_offset + r) * xd->is_mi_coded_stride + mi_col_offset + c;
+#if CONFIG_SDP
+      switch (xd->tree_type) {
+        case SHARED_PART:
+          xd->is_mi_coded[0][pos] = 1;
+          xd->is_mi_coded[1][pos] = 1;
+          break;
+        case LUMA_PART: xd->is_mi_coded[0][pos] = 1; break;
+        case CHROMA_PART: xd->is_mi_coded[1][pos] = 1; break;
+        default: assert(0 && "Invalid tree type");
+      }
+#else
+      xd->is_mi_coded[pos] = 1;
+#endif  // CONFIG_SDP
+    }
+}
+
+void av1_mark_block_as_not_coded(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                 BLOCK_SIZE bsize, BLOCK_SIZE sb_size) {
+  const int sb_mi_size = mi_size_wide[sb_size];
+  const int mi_row_offset = mi_row & (sb_mi_size - 1);
+  const int mi_col_offset = mi_col & (sb_mi_size - 1);
+
+  for (int r = 0; r < mi_size_high[bsize]; ++r) {
+    const int pos =
+        (mi_row_offset + r) * xd->is_mi_coded_stride + mi_col_offset;
+#if CONFIG_SDP
+    uint8_t *row_ptr_luma = &xd->is_mi_coded[0][pos];
+    uint8_t *row_ptr_chroma = &xd->is_mi_coded[1][pos];
+    switch (xd->tree_type) {
+      case SHARED_PART:
+        av1_zero_array(row_ptr_luma, mi_size_wide[bsize]);
+        av1_zero_array(row_ptr_chroma, mi_size_wide[bsize]);
+        break;
+      case LUMA_PART: av1_zero_array(row_ptr_luma, mi_size_wide[bsize]); break;
+      case CHROMA_PART:
+        av1_zero_array(row_ptr_chroma, mi_size_wide[bsize]);
+        break;
+      default: assert(0 && "Invalid tree type");
+    }
+#else
+    uint8_t *row_ptr = &xd->is_mi_coded[pos];
+    av1_zero_array(row_ptr, mi_size_wide[bsize]);
+#endif  // CONFIG_SDP
+  }
+}
+
+PARTITION_TREE *av1_alloc_ptree_node(PARTITION_TREE *parent, int index) {
+  PARTITION_TREE *ptree = NULL;
+  struct aom_internal_error_info error;
+
+  AOM_CHECK_MEM_ERROR(&error, ptree, aom_calloc(1, sizeof(*ptree)));
+
+  ptree->parent = parent;
+  ptree->index = index;
+  ptree->partition = PARTITION_NONE;
+  ptree->is_settled = 0;
+  for (int i = 0; i < 4; ++i) ptree->sub_tree[i] = NULL;
+
+  return ptree;
+}
+
+void av1_free_ptree_recursive(PARTITION_TREE *ptree) {
+  if (ptree == NULL) return;
+
+  for (int i = 0; i < 4; ++i) {
+    av1_free_ptree_recursive(ptree->sub_tree[i]);
+    ptree->sub_tree[i] = NULL;
+  }
+
+  aom_free(ptree);
+}
+
+void av1_reset_ptree_in_sbi(SB_INFO *sbi
+#if CONFIG_SDP
+                            ,
+                            TREE_TYPE tree_type
+#endif  // CONFIG_SDP
+) {
+#if CONFIG_SDP
+  const int idx = av1_get_sdp_idx(tree_type);
+  if (sbi->ptree_root[idx]) av1_free_ptree_recursive(sbi->ptree_root[idx]);
+
+  sbi->ptree_root[idx] = av1_alloc_ptree_node(NULL, 0);
+#else
+  if (sbi->ptree_root) av1_free_ptree_recursive(sbi->ptree_root);
+
+  sbi->ptree_root = av1_alloc_ptree_node(NULL, 0);
+#endif  // CONFIG_SDP
+}
+
 void av1_set_entropy_contexts(const MACROBLOCKD *xd,
                               struct macroblockd_plane *pd, int plane,
                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
@@ -67,14 +175,42 @@
     memset(l, has_eob, sizeof(*l) * txs_high);
   }
 }
+
 void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                const int num_planes) {
-  assert(bsize < BLOCK_SIZES_ALL);
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  // TODO(chiyotsai): This part is needed to avoid encoder/decoder mismatch.
+  // Investigate why this is the case. It seems like on the decoder side, the
+  // decoder is failing to clear the context after encoding a skip_txfm chroma
+  // block.
+  const int plane_start = (xd->tree_type == CHROMA_PART);
+  int plane_end = 0;
+  switch (xd->tree_type) {
+    case LUMA_PART: plane_end = 1; break;
+    case CHROMA_PART: plane_end = num_planes; break;
+    case SHARED_PART:
+      plane_end = 1 + (num_planes - 1) * xd->is_chroma_ref;
+      break;
+    default: assert(0);
+  }
+  for (int i = plane_start; i < plane_end; ++i) {
+#else
   const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
   for (int i = 0; i < nplanes; i++) {
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
     struct macroblockd_plane *const pd = &xd->plane[i];
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+        xd, xd->mi[0], i, pd->subsampling_x, pd->subsampling_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+    assert(plane_bsize ==
+           get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+    (void)bsize;
+#else
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     const int txs_wide = mi_size_wide[plane_bsize];
     const int txs_high = mi_size_high[plane_bsize];
     memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 97be296..63b40e0 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -237,6 +237,15 @@
   COMPOUND_TYPE type;
 } INTERINTER_COMPOUND_DATA;
 
+typedef struct CHROMA_REF_INFO {
+  int is_chroma_ref;
+  int offset_started;
+  int mi_row_chroma_base;
+  int mi_col_chroma_base;
+  BLOCK_SIZE bsize;
+  BLOCK_SIZE bsize_base;
+} CHROMA_REF_INFO;
+
 #define INTER_TX_SIZE_BUF_LEN 16
 #define TXK_TYPE_BUF_LEN 64
 /*!\endcond */
@@ -377,6 +386,8 @@
   uint8_t use_wedge_interintra : 1;
   /*! \brief CDEF strength per BLOCK_64X64 */
   int8_t cdef_strength : 4;
+  /*! \brief chroma block info for sub-8x8 cases */
+  CHROMA_REF_INFO chroma_ref_info;
 #if CONFIG_CCSO
   /*! \brief Whether to use cross-component sample offset for the U plane. */
   uint8_t ccso_blk_u : 2;
@@ -401,6 +412,38 @@
 
 /*!\cond */
 
+typedef struct PARTITION_TREE {
+  struct PARTITION_TREE *parent;
+  struct PARTITION_TREE *sub_tree[4];
+  PARTITION_TYPE partition;
+  BLOCK_SIZE bsize;
+  int is_settled;
+  int mi_row;
+  int mi_col;
+  int index;
+  CHROMA_REF_INFO chroma_ref_info;
+} PARTITION_TREE;
+
+PARTITION_TREE *av1_alloc_ptree_node(PARTITION_TREE *parent, int index);
+void av1_free_ptree_recursive(PARTITION_TREE *ptree);
+
+typedef struct SB_INFO {
+  int mi_row;
+  int mi_col;
+#if CONFIG_SDP
+  PARTITION_TREE *ptree_root[2];
+#else
+  PARTITION_TREE *ptree_root;
+#endif  // CONFIG_SDP
+} SB_INFO;
+
+void av1_reset_ptree_in_sbi(SB_INFO *sbi
+#if CONFIG_SDP
+                            ,
+                            TREE_TYPE tree_type
+#endif  // CONFIG_SDP
+);
+
 #if CONFIG_SDP
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi, int tree_type) {
   return mbmi->use_intrabc[tree_type == CHROMA_PART];
@@ -444,6 +487,30 @@
 }
 #endif
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static INLINE PARTITION_TYPE get_partition_from_symbol_rec_block(
+    BLOCK_SIZE bsize, PARTITION_TYPE_REC partition_rec) {
+  if (block_size_wide[bsize] > block_size_high[bsize])
+    return partition_map_from_symbol_block_wgth[partition_rec];
+  else if (block_size_high[bsize] > block_size_wide[bsize])
+    return partition_map_from_symbol_block_hgtw[partition_rec];
+  else
+    return PARTITION_INVALID;
+}
+
+static INLINE PARTITION_TYPE_REC get_symbol_from_partition_rec_block(
+    BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  assert(partition < EXT_PARTITION_TYPES);
+  if (block_size_wide[bsize] > block_size_high[bsize])
+    return symbol_map_from_partition_block_wgth[partition];
+  else if (block_size_high[bsize] > block_size_wide[bsize])
+    return symbol_map_from_partition_block_hgtw[partition];
+  else
+    return PARTITION_INVALID_REC;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[1] > INTRA_FRAME;
 }
@@ -503,6 +570,356 @@
          block_size_allowed;
 }
 
+static INLINE int is_square_block(BLOCK_SIZE bsize) {
+  return block_size_high[bsize] == block_size_wide[bsize];
+}
+
+static INLINE int is_partition_point(BLOCK_SIZE bsize) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+  return bsize != BLOCK_4X4 && bsize < BLOCK_SIZES;
+#else
+  return is_square_block(bsize) && bsize >= BLOCK_8X8 && bsize < BLOCK_SIZES;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+}
+
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_4X4: return 0;
+    case BLOCK_8X8: return 1;
+    case BLOCK_16X16: return 2;
+    case BLOCK_32X32: return 3;
+    case BLOCK_64X64: return 4;
+    case BLOCK_128X128: return 5;
+    default: return SQR_BLOCK_SIZES;
+  }
+}
+
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Note: the input block size should be square.
+// Otherwise it's considered invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+                                               PARTITION_TYPE partition) {
+  if (partition == PARTITION_INVALID) {
+    return BLOCK_INVALID;
+  } else {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    if (is_partition_point(bsize))
+      return subsize_lookup[partition][bsize];
+    else
+      return partition == PARTITION_NONE ? bsize : BLOCK_INVALID;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+    return sqr_bsize_idx >= SQR_BLOCK_SIZES
+               ? BLOCK_INVALID
+               : subsize_lookup[partition][sqr_bsize_idx];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+  }
+}
+
+static INLINE int is_partition_valid(BLOCK_SIZE bsize, PARTITION_TYPE p) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (p == PARTITION_SPLIT) return 0;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+  if (is_partition_point(bsize))
+    return get_partition_subsize(bsize, p) < BLOCK_SIZES_ALL;
+  else
+    return p == PARTITION_NONE;
+}
+
+static INLINE void initialize_chr_ref_info(int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize,
+                                           CHROMA_REF_INFO *info) {
+  info->is_chroma_ref = 1;
+  info->offset_started = 0;
+  info->mi_row_chroma_base = mi_row;
+  info->mi_col_chroma_base = mi_col;
+  info->bsize = bsize;
+  info->bsize_base = bsize;
+}
+
+// Decide whether a block needs to code multiple chroma coding blocks at once
+// to work around the sub-4x4 chroma coding restriction.
+static INLINE int have_nz_chroma_ref_offset(BLOCK_SIZE bsize,
+                                            PARTITION_TYPE partition,
+                                            int subsampling_x,
+                                            int subsampling_y) {
+  const int bw = block_size_wide[bsize] >> subsampling_x;
+  const int bh = block_size_high[bsize] >> subsampling_y;
+  const int bw_less_than_4 = bw < 4;
+  const int bh_less_than_4 = bh < 4;
+  const int hbw_less_than_4 = bw < 8;
+  const int hbh_less_than_4 = bh < 8;
+  const int qbw_less_than_4 = bw < 16;
+  const int qbh_less_than_4 = bh < 16;
+  switch (partition) {
+    case PARTITION_NONE: return bw_less_than_4 || bh_less_than_4;
+    case PARTITION_HORZ: return bw_less_than_4 || hbh_less_than_4;
+    case PARTITION_VERT: return hbw_less_than_4 || bh_less_than_4;
+    case PARTITION_SPLIT: return hbw_less_than_4 || hbh_less_than_4;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3: return bw_less_than_4 || qbh_less_than_4;
+    case PARTITION_VERT_3: return qbw_less_than_4 || bh_less_than_4;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B: return hbw_less_than_4 || hbh_less_than_4;
+    case PARTITION_HORZ_4: return bw_less_than_4 || qbh_less_than_4;
+    case PARTITION_VERT_4: return qbw_less_than_4 || bh_less_than_4;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    default:
+      assert(0 && "Invalid partition type!");
+      return 0;
+      break;
+  }
+}
+
+// Decide whether a subblock is the main chroma reference when its parent block
+// needs coding multiple chroma coding blocks at once. The function returns a
+// flag indicating whether the mode info used for the combined chroma block is
+// located in the subblock.
+static INLINE int is_sub_partition_chroma_ref(PARTITION_TYPE partition,
+                                              int index, BLOCK_SIZE bsize,
+                                              BLOCK_SIZE parent_bsize, int ss_x,
+                                              int ss_y, int is_offset_started) {
+  (void)is_offset_started;
+  (void)parent_bsize;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int pw = bw >> ss_x;
+  const int ph = bh >> ss_y;
+  const int pw_less_than_4 = pw < 4;
+  const int ph_less_than_4 = ph < 4;
+  switch (partition) {
+    case PARTITION_NONE: return 1;
+    case PARTITION_HORZ:
+    case PARTITION_VERT: return index == 1;
+    case PARTITION_SPLIT:
+      if (is_offset_started) {
+        return index == 3;
+      } else {
+        if (pw_less_than_4 && ph_less_than_4)
+          return index == 3;
+        else if (pw_less_than_4)
+          return index == 1 || index == 3;
+        else if (ph_less_than_4)
+          return index == 2 || index == 3;
+        else
+          return 1;
+      }
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_VERT_3:
+    case PARTITION_HORZ_3: return index == 2;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+      if (is_offset_started) {
+        return index == 2;
+      } else {
+        const int smallest_w = block_size_wide[parent_bsize] >> (ss_x + 1);
+        const int smallest_h = block_size_high[parent_bsize] >> (ss_y + 1);
+        const int smallest_w_less_than_4 = smallest_w < 4;
+        const int smallest_h_less_than_4 = smallest_h < 4;
+        if (smallest_w_less_than_4 && smallest_h_less_than_4) {
+          return index == 2;
+        } else if (smallest_w_less_than_4) {
+          if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+            return index == 2;
+          } else if (partition == PARTITION_HORZ_A) {
+            return index == 1 || index == 2;
+          } else {
+            return index == 0 || index == 2;
+          }
+        } else if (smallest_h_less_than_4) {
+          if (partition == PARTITION_HORZ_A || partition == PARTITION_HORZ_B) {
+            return index == 2;
+          } else if (partition == PARTITION_VERT_A) {
+            return index == 1 || index == 2;
+          } else {
+            return index == 0 || index == 2;
+          }
+        } else {
+          return 1;
+        }
+      }
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      if (is_offset_started) {
+        return index == 3;
+      } else {
+        if ((partition == PARTITION_HORZ_4 && ph_less_than_4) ||
+            (partition == PARTITION_VERT_4 && pw_less_than_4)) {
+          return index == 1 || index == 3;
+        } else {
+          return 1;
+        }
+      }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    default:
+      assert(0 && "Invalid partition type!");
+      return 0;
+      break;
+  }
+}
+
+static INLINE void set_chroma_ref_offset_size(
+    int mi_row, int mi_col, PARTITION_TYPE partition, BLOCK_SIZE bsize,
+    BLOCK_SIZE parent_bsize, int ss_x, int ss_y, CHROMA_REF_INFO *info,
+    const CHROMA_REF_INFO *parent_info) {
+  const int pw = block_size_wide[bsize] >> ss_x;
+  const int ph = block_size_high[bsize] >> ss_y;
+  const int pw_less_than_4 = pw < 4;
+  const int ph_less_than_4 = ph < 4;
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  const int hppw = block_size_wide[parent_bsize] >> (ss_x + 1);
+  const int hpph = block_size_high[parent_bsize] >> (ss_y + 1);
+  const int hppw_less_than_4 = hppw < 4;
+  const int hpph_less_than_4 = hpph < 4;
+  const int mi_row_mid_point =
+      parent_info->mi_row_chroma_base + (mi_size_high[parent_bsize] >> 1);
+  const int mi_col_mid_point =
+      parent_info->mi_col_chroma_base + (mi_size_wide[parent_bsize] >> 1);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+  assert(parent_info->offset_started == 0);
+  switch (partition) {
+    case PARTITION_NONE:
+    case PARTITION_HORZ:
+    case PARTITION_VERT:
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_VERT_3:
+    case PARTITION_HORZ_3:
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+      info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+      info->bsize_base = parent_bsize;
+      break;
+    case PARTITION_SPLIT:
+      if (pw_less_than_4 && ph_less_than_4) {
+        info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        info->bsize_base = parent_bsize;
+      } else if (pw_less_than_4) {
+        info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_HORZ);
+        info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        if (mi_row == parent_info->mi_row_chroma_base) {
+          info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        } else {
+          info->mi_row_chroma_base =
+              parent_info->mi_row_chroma_base + mi_size_high[bsize];
+        }
+      } else {
+        assert(ph_less_than_4);
+        info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_VERT);
+        info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        if (mi_col == parent_info->mi_col_chroma_base) {
+          info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        } else {
+          info->mi_col_chroma_base =
+              parent_info->mi_col_chroma_base + mi_size_wide[bsize];
+        }
+      }
+      break;
+#if !CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+      if ((hppw_less_than_4 && hpph_less_than_4) ||
+          (hppw_less_than_4 &&
+           (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B)) ||
+          (hpph_less_than_4 &&
+           (partition == PARTITION_HORZ_A || partition == PARTITION_HORZ_B))) {
+        info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        info->bsize_base = parent_bsize;
+      } else if (hppw_less_than_4) {
+        info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_HORZ);
+        info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        if (mi_row == parent_info->mi_row_chroma_base) {
+          info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        } else {
+          info->mi_row_chroma_base = parent_info->mi_row_chroma_base +
+                                     (mi_size_high[parent_bsize] >> 1);
+        }
+      } else {
+        assert(hpph_less_than_4);
+        info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_VERT);
+        info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+        if (mi_col == parent_info->mi_col_chroma_base) {
+          info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+        } else {
+          info->mi_col_chroma_base = parent_info->mi_col_chroma_base +
+                                     (mi_size_wide[parent_bsize] >> 1);
+        }
+      }
+      break;
+    case PARTITION_HORZ_4:
+      info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_HORZ);
+      info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+      if (mi_row < mi_row_mid_point) {
+        info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+      } else {
+        info->mi_row_chroma_base = mi_row_mid_point;
+      }
+      break;
+    case PARTITION_VERT_4:
+      info->bsize_base = get_partition_subsize(parent_bsize, PARTITION_VERT);
+      info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+      if (mi_col < mi_col_mid_point) {
+        info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+      } else {
+        info->mi_col_chroma_base = mi_col_mid_point;
+      }
+      break;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+    default: assert(0 && "Invalid partition type!"); break;
+  }
+}
+
+static INLINE void set_chroma_ref_info(int mi_row, int mi_col, int index,
+                                       BLOCK_SIZE bsize, CHROMA_REF_INFO *info,
+                                       const CHROMA_REF_INFO *parent_info,
+                                       BLOCK_SIZE parent_bsize,
+                                       PARTITION_TYPE parent_partition,
+                                       int ss_x, int ss_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  initialize_chr_ref_info(mi_row, mi_col, bsize, info);
+  if (parent_info == NULL) return;
+  if (parent_info->is_chroma_ref) {
+    if (parent_info->offset_started) {
+      if (is_sub_partition_chroma_ref(parent_partition, index, bsize,
+                                      parent_bsize, ss_x, ss_y, 1)) {
+        info->is_chroma_ref = 1;
+      } else {
+        info->is_chroma_ref = 0;
+      }
+      info->offset_started = 1;
+      info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+      info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+      info->bsize_base = parent_info->bsize_base;
+    } else if (have_nz_chroma_ref_offset(parent_bsize, parent_partition, ss_x,
+                                         ss_y)) {
+      info->offset_started = 1;
+      info->is_chroma_ref = is_sub_partition_chroma_ref(
+          parent_partition, index, bsize, parent_bsize, ss_x, ss_y, 0);
+      set_chroma_ref_offset_size(mi_row, mi_col, parent_partition, bsize,
+                                 parent_bsize, ss_x, ss_y, info, parent_info);
+    }
+  } else {
+    info->is_chroma_ref = 0;
+    info->offset_started = 1;
+    info->mi_row_chroma_base = parent_info->mi_row_chroma_base;
+    info->mi_col_chroma_base = parent_info->mi_col_chroma_base;
+    info->bsize_base = parent_info->bsize_base;
+  }
+}
+
 #if CONFIG_MISMATCH_DEBUG
 static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
                                    int mi_row, int tx_blk_col, int tx_blk_row,
@@ -745,6 +1162,11 @@
   MB_MODE_INFO *chroma_above_mbmi;
 
   /*!
+   * SB_INFO for the superblock that the current coding block is located in
+   */
+  SB_INFO *sbi;
+
+  /*!
    * Appropriate offset based on current 'mi_row' and 'mi_col', inside
    * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
    * 'MACROBLOCK' structs.
@@ -772,7 +1194,22 @@
    * block shares the same tree or not.
    */
   TREE_TYPE tree_type;
-#endif
+
+  /*!
+   * An array for recording whether an mi(4x4) is coded. Reset at sb level.
+   */
+  // TODO(any): Convert to bit field instead.
+  uint8_t is_mi_coded[2][MAX_MIB_SQUARE];
+#else
+  /*!
+   * An array for recording whether an mi(4x4) is coded. Reset at sb level.
+   */
+  uint8_t is_mi_coded[MAX_MIB_SQUARE];
+#endif  // CONFIG_SDP
+  /*!
+   * Stride of the is_mi_coded array.
+   */
+  int is_mi_coded_stride;
 
   /*!
    * Scale factors for reference frames of the current block.
@@ -1061,37 +1498,6 @@
              : buf16;
 }
 
-static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
-  switch (bsize) {
-    case BLOCK_4X4: return 0;
-    case BLOCK_8X8: return 1;
-    case BLOCK_16X16: return 2;
-    case BLOCK_32X32: return 3;
-    case BLOCK_64X64: return 4;
-    case BLOCK_128X128: return 5;
-    default: return SQR_BLOCK_SIZES;
-  }
-}
-
-// For a square block size 'bsize', returns the size of the sub-blocks used by
-// the given partition type. If the partition produces sub-blocks of different
-// sizes, then the function returns the largest sub-block size.
-// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
-// Conversion tables).
-// Note: the input block size should be square.
-// Otherwise it's considered invalid.
-static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
-                                               PARTITION_TYPE partition) {
-  if (partition == PARTITION_INVALID) {
-    return BLOCK_INVALID;
-  } else {
-    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
-    return sqr_bsize_idx >= SQR_BLOCK_SIZES
-               ? BLOCK_INVALID
-               : subsize_lookup[partition][sqr_bsize_idx];
-  }
-}
-
 static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
                                      PLANE_TYPE plane_type) {
   static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
@@ -1266,6 +1672,74 @@
   return ss_size_lookup[bsize][subsampling_x][subsampling_y];
 }
 
+#if CONFIG_SDP
+static INLINE int av1_get_sdp_idx(TREE_TYPE tree_type) {
+  switch (tree_type) {
+    case SHARED_PART:
+    case LUMA_PART: return 0;
+    case CHROMA_PART: return 1;
+    default: assert(0 && "Invalid tree type"); return 0;
+  }
+}
+#endif  // CONFIG_SDP
+
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+static INLINE BLOCK_SIZE get_bsize_base(const MACROBLOCKD *xd,
+                                        const MB_MODE_INFO *mbmi, int plane) {
+  BLOCK_SIZE bsize_base = BLOCK_INVALID;
+#if CONFIG_SDP
+  if (xd->tree_type == SHARED_PART) {
+    bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base : mbmi->sb_type[PLANE_TYPE_Y];
+  } else {
+    bsize_base = mbmi->sb_type[av1_get_sdp_idx(xd->tree_type)];
+  }
+#else
+  bsize_base = plane ? mbmi->chroma_ref_info.bsize_base : mbmi->sb_type;
+  (void)xd;
+#endif  // CONFIG_SDP
+  return bsize_base;
+}
+
+static INLINE BLOCK_SIZE get_mb_plane_block_size(const MACROBLOCKD *xd,
+                                                 const MB_MODE_INFO *mbmi,
+                                                 int plane, int subsampling_x,
+                                                 int subsampling_y) {
+  assert(subsampling_x >= 0 && subsampling_x < 2);
+  assert(subsampling_y >= 0 && subsampling_y < 2);
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  return get_plane_block_size(bsize_base, subsampling_x, subsampling_y);
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+
+#if CONFIG_SDP
+// These are only needed to support lpf multi-thread.
+// Because xd is shared among all the thread workers, xd->tree_type does not
+// contain the valid tree_type, so we are passing in the tree_type
+static INLINE BLOCK_SIZE get_bsize_base_from_tree_type(const MB_MODE_INFO *mbmi,
+                                                       TREE_TYPE tree_type,
+                                                       int plane) {
+  BLOCK_SIZE bsize_base = BLOCK_INVALID;
+  if (tree_type == SHARED_PART) {
+    bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base : mbmi->sb_type[PLANE_TYPE_Y];
+  } else {
+    bsize_base = mbmi->sb_type[av1_get_sdp_idx(tree_type)];
+  }
+  return bsize_base;
+}
+
+static INLINE BLOCK_SIZE get_mb_plane_block_size_from_tree_type(
+    const MB_MODE_INFO *mbmi, TREE_TYPE tree_type, int plane, int subsampling_x,
+    int subsampling_y) {
+  assert(subsampling_x >= 0 && subsampling_x < 2);
+  assert(subsampling_y >= 0 && subsampling_y < 2);
+  const BLOCK_SIZE bsize_base =
+      get_bsize_base_from_tree_type(mbmi, tree_type, plane);
+  return get_plane_block_size(bsize_base, subsampling_x, subsampling_y);
+}
+#endif  // CONFIG_SDP
+
 /*
  * Logic to generate the lookup tables:
  *
@@ -1471,7 +1945,7 @@
     if (is_inter_block(mbmi, xd->tree_type)) {
 #else
     if (is_inter_block(mbmi)) {
-#endif  // CONFIG_SDP
+#endif
       // scale back to y plane's coordinate
       const struct macroblockd_plane *const pd = &xd->plane[plane_type];
       blk_row <<= pd->subsampling_y;
@@ -1492,7 +1966,7 @@
                                 reduced_tx_set);
 #else
         av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
-#endif  // CONFIG_SDP
+#endif
     if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
   }
 #if CONFIG_IST
@@ -1597,12 +2071,16 @@
   if (xd->lossless[mbmi->segment_id]) return TX_4X4;
   if (plane == 0) return mbmi->tx_size;
   const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
-#if CONFIG_SDP
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  return av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
+                               pd->subsampling_y);
+#elif CONFIG_SDP
   return av1_get_max_uv_txsize(mbmi->sb_type[PLANE_TYPE_UV], pd->subsampling_x,
                                pd->subsampling_y);
 #else
-  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
-                               pd->subsampling_y);
+  return av1_get_max_uv_txsize(mbmi->chroma_ref_info.bsize_base,
+                               pd->subsampling_x, pd->subsampling_y);
 #endif
 }
 
@@ -1623,6 +2101,12 @@
                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                               int has_eob, int aoff, int loff);
 
+void av1_reset_is_mi_coded_map(MACROBLOCKD *xd, int stride);
+void av1_mark_block_as_coded(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                             BLOCK_SIZE sb_size);
+void av1_mark_block_as_not_coded(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                 BLOCK_SIZE bsize, BLOCK_SIZE sb_size);
+
 #define MAX_INTERINTRA_SB_SQUARE 32 * 32
 static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
   return (mbmi->ref_frame[0] > INTRA_FRAME &&
@@ -1677,9 +2161,27 @@
   return av1_get_adjusted_tx_size(max_txsize);  // chroma
 }
 
-static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize,
+                                                    int mi_row, int mi_col) {
   assert(bsize < BLOCK_SIZES_ALL);
-  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+
+  if (AOMMIN(block_size_wide[bsize], block_size_high[bsize]) < 8) {
+    return 0;
+  }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // TODO(urvang): Enable this special case if we make OBMC work.
+  // TODO(yuec): Enable this case when the alignment issue is fixed. There
+  // will be memory leak in global above_pred_buff and left_pred_buff if
+  // the restriction on mi_row and mi_col is removed.
+  if ((mi_row & 0x01) || (mi_col & 0x01)) {
+    return 0;
+  }
+#else
+  assert(!(mi_row & 0x01) && !(mi_col & 0x01));
+  (void)mi_row;
+  (void)mi_col;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+  return 1;
 }
 
 static INLINE int is_motion_variation_allowed_compound(
@@ -1703,9 +2205,11 @@
     if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
   }
 #if CONFIG_SDP
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y]) &&
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y], xd->mi_row,
+                                        xd->mi_col) &&
 #else
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type, xd->mi_row,
+                                        xd->mi_col) &&
 #endif
       is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
       is_motion_variation_allowed_compound(mbmi)) {
@@ -1752,6 +2256,7 @@
                                             int *height,
                                             int *rows_within_bounds,
                                             int *cols_within_bounds) {
+  if (plane > 0) bsize = xd->mi[0]->chroma_ref_info.bsize_base;
   const int block_height = block_size_high[bsize];
   const int block_width = block_size_wide[bsize];
   const int block_rows = (xd->mb_to_bottom_edge >= 0)
@@ -1842,6 +2347,18 @@
   }
   return tx_size_2d[tx_size];
 }
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+static AOM_INLINE const PARTITION_TREE *get_partition_subtree_const(
+    const PARTITION_TREE *partition_tree, int idx) {
+  if (!partition_tree) {
+    return NULL;
+  }
+  return partition_tree->sub_tree[idx];
+}
+
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 /*!\endcond */
 
 #ifdef __cplusplus
diff --git a/av1/common/ccso.c b/av1/common/ccso.c
index 5596ceb..5f8e44c 100644
--- a/av1/common/ccso.c
+++ b/av1/common/ccso.c
@@ -271,8 +271,7 @@
 void ccso_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
                 uint16_t *ext_rec_y) {
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, frame, 0, 0, 0, num_planes, NULL);
 
   const uint8_t quant_sz[4] = { 16, 8, 32, 64 };
   for (int plane = 1; plane < 3; plane++) {
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index ffb0159..14ec3bb 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -142,8 +142,7 @@
   int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
   const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, frame, 0, 0, 0, num_planes, NULL);
   row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
   memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
   prev_row_cdef = row_cdef + 1;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 718f6c0..1df05bb 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -372,41 +372,21 @@
   }
 }
 
-// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
-// and non-chroma-referenced blocks are stored together in the CfL buffer.
-static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
-                                        int mi_col, int *row_out,
-                                        int *col_out) {
-  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
-  if ((mi_row & 0x01) && cfl->subsampling_y) {
-    assert(*row_out == 0);
-    (*row_out)++;
-  }
-
-  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
-  if ((mi_col & 0x01) && cfl->subsampling_x) {
-    assert(*col_out == 0);
-    (*col_out)++;
-  }
-}
-
-void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
-                  BLOCK_SIZE bsize) {
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size) {
   CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
   uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int row_offset = mi_row - xd->mi[0]->chroma_ref_info.mi_row_chroma_base;
+  const int col_offset = mi_col - xd->mi[0]->chroma_ref_info.mi_col_chroma_base;
 
-  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
-    // Only dimensions of size 4 can have an odd offset.
-    assert(!((col & 1) && tx_size_wide[tx_size] != 4));
-    assert(!((row & 1) && tx_size_high[tx_size] != 4));
-    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
-  }
 #if CONFIG_SDP
-  cfl_store(xd, cfl, dst, pd->dst.stride, row, col, tx_size,
-            is_cur_buf_hbd(xd));
+  cfl_store(xd, cfl, dst, pd->dst.stride, row + row_offset, col + col_offset,
+            tx_size, is_cur_buf_hbd(xd));
 #else
-  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
+  cfl_store(cfl, dst, pd->dst.stride, row + row_offset, col + col_offset,
+            tx_size, is_cur_buf_hbd(xd));
 #endif
 }
 
@@ -429,21 +409,20 @@
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
   CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
-  int row = 0;
-  int col = 0;
-
-  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
-    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
-  }
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int row_offset = mi_row - xd->mi[0]->chroma_ref_info.mi_row_chroma_base;
+  const int col_offset = mi_col - xd->mi[0]->chroma_ref_info.mi_col_chroma_base;
+
   tx_size = get_tx_size(width, height);
   assert(tx_size != TX_INVALID);
 #if CONFIG_SDP
-  cfl_store(xd, cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
-            is_cur_buf_hbd(xd));
+  cfl_store(xd, cfl, pd->dst.buf, pd->dst.stride, row_offset, col_offset,
+            tx_size, is_cur_buf_hbd(xd));
 #else
-  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row_offset, col_offset, tx_size,
             is_cur_buf_hbd(xd));
 #endif
 }
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 2463e2d..f9c11c6 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -20,9 +20,7 @@
   const MB_MODE_INFO *mbmi = xd->mi[0];
 #if CONFIG_SDP
   if (xd->tree_type == LUMA_PART) return CFL_DISALLOWED;
-  const BLOCK_SIZE bsize =
-      mbmi->sb_type[xd->tree_type == SHARED_PART ? PLANE_TYPE_Y
-                                                 : PLANE_TYPE_UV];
+  const BLOCK_SIZE bsize = get_bsize_base(xd, mbmi, AOM_PLANE_U);
 #else
   const BLOCK_SIZE bsize = mbmi->sb_type;
 #endif
@@ -82,8 +80,7 @@
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
 
-void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
-                  BLOCK_SIZE bsize);
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size);
 
 void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
                        CFL_PRED_TYPE pred_plane, int width);
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index ca47b4a..88ce99a 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -12,6 +12,9 @@
 #ifndef AOM_AV1_COMMON_COMMON_DATA_H_
 #define AOM_AV1_COMMON_COMMON_DATA_H_
 
+#include <assert.h>
+#include <stdbool.h>
+
 #include "av1/common/enums.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -65,6 +68,112 @@
   4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
 };
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static const PARTITION_TYPE
+    partition_map_from_symbol_block_wgth[PARTITION_TYPES_REC] = {
+      PARTITION_NONE,
+      PARTITION_VERT,
+      PARTITION_VERT_3,
+      PARTITION_HORZ,
+    };
+
+static const PARTITION_TYPE_REC
+    symbol_map_from_partition_block_wgth[EXT_PARTITION_TYPES] = {
+      PARTITION_NONE_REC,        PARTITION_SHORT_SIDE_2_REC,
+      PARTITION_LONG_SIDE_2_REC, PARTITION_INVALID_REC,
+      PARTITION_LONG_SIDE_3_REC,
+    };
+
+static const PARTITION_TYPE
+    partition_map_from_symbol_block_hgtw[PARTITION_TYPES_REC] = {
+      PARTITION_NONE,
+      PARTITION_HORZ,
+      PARTITION_HORZ_3,
+      PARTITION_VERT,
+    };
+
+static const PARTITION_TYPE_REC
+    symbol_map_from_partition_block_hgtw[EXT_PARTITION_TYPES] = {
+      PARTITION_NONE_REC,         PARTITION_LONG_SIDE_2_REC,
+      PARTITION_SHORT_SIDE_2_REC, PARTITION_LONG_SIDE_3_REC,
+      PARTITION_INVALID_REC,
+    };
+
+/* clang-format off */
+// This table covers all square blocks and 1:2/2:1 rectangular blocks
+static const BLOCK_SIZE
+    subsize_lookup[EXT_PARTITION_TYPES + 1][BLOCK_SIZES_ALL] = {
+  {     // PARTITION_NONE
+    BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
+    BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
+    BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16,
+  }, {  // PARTITION_HORZ
+    BLOCK_INVALID, BLOCK_4X4, BLOCK_INVALID, BLOCK_8X4, BLOCK_8X8, BLOCK_16X4,
+    BLOCK_16X8, BLOCK_16X16, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32, BLOCK_64X16,
+    BLOCK_64X32, BLOCK_64X64, BLOCK_INVALID, BLOCK_128X64,
+    BLOCK_4X8, BLOCK_INVALID, BLOCK_8X16, BLOCK_INVALID, BLOCK_16X32,
+    BLOCK_INVALID,
+  }, {  // PARTITION_VERT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, BLOCK_4X8, BLOCK_4X16, BLOCK_8X8,
+    BLOCK_8X16, BLOCK_8X32, BLOCK_16X16, BLOCK_16X32, BLOCK_16X64, BLOCK_32X32,
+    BLOCK_32X64, BLOCK_INVALID, BLOCK_64X64, BLOCK_64X128,
+    BLOCK_INVALID, BLOCK_8X4, BLOCK_INVALID, BLOCK_16X8, BLOCK_INVALID,
+    BLOCK_32X16,
+  }, {  // PARTITION_HORZ_3
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+    BLOCK_INVALID, BLOCK_16X4, BLOCK_16X8,
+    BLOCK_INVALID,
+    BLOCK_32X8, BLOCK_32X16,
+    BLOCK_INVALID,
+    BLOCK_64X16, BLOCK_64X32, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X4, BLOCK_INVALID, BLOCK_8X8, BLOCK_INVALID, BLOCK_16X16,
+    BLOCK_INVALID,
+  }, {  // PARTITION_VERT_3
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8, BLOCK_4X16,
+    BLOCK_INVALID,
+    BLOCK_8X16, BLOCK_8X32,
+    BLOCK_INVALID,
+    BLOCK_16X32, BLOCK_16X64, BLOCK_INVALID, BLOCK_32X64, BLOCK_INVALID,
+    BLOCK_INVALID, BLOCK_4X4, BLOCK_INVALID, BLOCK_8X8,
+    BLOCK_INVALID, BLOCK_16X16,
+  }, {  // PARTITION_SPLIT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, BLOCK_INVALID,
+    BLOCK_INVALID, BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X64, BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_INVALID, BLOCK_INVALID,
+  },
+};
+
+#if CONFIG_SDP
+static AOM_INLINE PARTITION_TYPE sdp_chroma_part_from_luma(BLOCK_SIZE bsize,
+                                         PARTITION_TYPE luma_part, int ssx,
+                                         int ssy) {
+  const int bh_chr = block_size_high[bsize] >> ssy;
+  const int bw_chr = block_size_wide[bsize] >> ssx;
+
+  switch (luma_part) {
+    case PARTITION_NONE: return PARTITION_NONE;
+    case PARTITION_HORZ: return (bh_chr < 8) ? PARTITION_NONE : PARTITION_HORZ;
+    case PARTITION_HORZ_3:
+      if (bh_chr >= 16)
+        return PARTITION_HORZ_3;
+      else
+        return (bh_chr < 8) ? PARTITION_NONE : PARTITION_HORZ;
+    case PARTITION_VERT: return (bw_chr < 8) ? PARTITION_NONE : PARTITION_VERT;
+    case PARTITION_VERT_3:
+      if (bw_chr >= 16)
+        return PARTITION_VERT_3;
+      else
+        return (bw_chr < 8) ? PARTITION_NONE : PARTITION_VERT;
+    default: assert(0);
+  }
+  return PARTITION_INVALID;
+}
+#endif  // CONFIG_SDP
+#else  // CONFIG_EXT_RECUR_PARTITIONS
 // A compressed version of the Partition_Subsize table in the spec (9.3.
 // Conversion tables), for square block sizes only.
 /* clang-format off */
@@ -101,6 +210,7 @@
     BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID
   }
 };
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {
   //                   4X4
@@ -468,6 +578,22 @@
   { 13, 3 },
 };
 
+static AOM_INLINE bool is_bsize_geq(BLOCK_SIZE bsize1, BLOCK_SIZE bsize2) {
+  if (bsize1 == BLOCK_INVALID || bsize2 == BLOCK_INVALID) {
+    return false;
+  }
+  return block_size_wide[bsize1] >= block_size_wide[bsize2] &&
+         block_size_high[bsize1] >= block_size_high[bsize2];
+}
+
+static AOM_INLINE bool is_bsize_gt(BLOCK_SIZE bsize1, BLOCK_SIZE bsize2) {
+  if (bsize1 == BLOCK_INVALID || bsize2 == BLOCK_INVALID) {
+    return false;
+  }
+  return block_size_wide[bsize1] > block_size_wide[bsize2] &&
+         block_size_high[bsize1] > block_size_high[bsize2];
+}
+
 #if CONFIG_IST
 // Mapping of intra modes to IST kernel set
 // Secondary transforms are enabled only intra modes < PAETH_PRED.
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 1396f9a..44c40e6 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -204,7 +204,64 @@
         { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248,
                     9875, 10521, 29048) } }
     };
-#if CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+static const aom_cdf_prob
+    default_partition_cdf[PARTITION_STRUCTURE_NUM][PARTITION_CONTEXTS][CDF_SIZE(
+        EXT_PARTITION_TYPES)] = { {
+                                      // 8x8
+                                      { AOM_CDF3(22187, 28914) },
+                                      { AOM_CDF3(17354, 25544) },
+                                      { AOM_CDF3(16287, 28824) },
+                                      { AOM_CDF3(15189, 27217) },
+                                      // 16x16
+                                      { AOM_CDF5(14665, 22357, 28960, 30645) },
+                                      { AOM_CDF5(7804, 13703, 27420, 29025) },
+                                      { AOM_CDF5(7508, 23180, 28044, 30882) },
+                                      { AOM_CDF5(5058, 16122, 25275, 28359) },
+                                      // 32x32
+                                      { AOM_CDF5(11795, 19886, 27120, 29401) },
+                                      { AOM_CDF5(5127, 12682, 26374, 28387) },
+                                      { AOM_CDF5(5522, 19614, 27318, 30668) },
+                                      { AOM_CDF5(3450, 12856, 24163, 28493) },
+                                      // 64x64
+                                      { AOM_CDF5(21562, 26118, 30872, 31711) },
+                                      { AOM_CDF5(5489, 14515, 28365, 29969) },
+                                      { AOM_CDF5(5873, 19195, 28209, 31603) },
+                                      { AOM_CDF5(1674, 15579, 28805, 31560) },
+                                      // 128x128
+                                      { AOM_CDF3(25710, 28640) },
+                                      { AOM_CDF3(7561, 14721) },
+                                      { AOM_CDF3(9603, 21021) },
+                                      { AOM_CDF3(1736, 12989) },
+                                  },
+                                  {
+                                      // 8x8
+                                      { AOM_CDF3(22187, 28914) },
+                                      { AOM_CDF3(17354, 25544) },
+                                      { AOM_CDF3(16287, 28824) },
+                                      { AOM_CDF3(15189, 27217) },
+                                      // 16x16
+                                      { AOM_CDF5(14665, 22357, 28960, 30645) },
+                                      { AOM_CDF5(7804, 13703, 27420, 29025) },
+                                      { AOM_CDF5(7508, 23180, 28044, 30882) },
+                                      { AOM_CDF5(5058, 16122, 25275, 28359) },
+                                      // 32x32
+                                      { AOM_CDF5(11795, 19886, 27120, 29401) },
+                                      { AOM_CDF5(5127, 12682, 26374, 28387) },
+                                      { AOM_CDF5(5522, 19614, 27318, 30668) },
+                                      { AOM_CDF5(3450, 12856, 24163, 28493) },
+                                      // 64x64
+                                      { AOM_CDF5(21562, 26118, 30872, 31711) },
+                                      { AOM_CDF5(5489, 14515, 28365, 29969) },
+                                      { AOM_CDF5(5873, 19195, 28209, 31603) },
+                                      { AOM_CDF5(1674, 15579, 28805, 31560) },
+                                      // 128x128
+                                      { AOM_CDF3(25710, 28640) },
+                                      { AOM_CDF3(7561, 14721) },
+                                      { AOM_CDF3(9603, 21021) },
+                                      { AOM_CDF3(1736, 12989) },
+                                  } };
+#elif CONFIG_SDP
 static const aom_cdf_prob
     default_partition_cdf[PARTITION_STRUCTURE_NUM][PARTITION_CONTEXTS][CDF_SIZE(
         EXT_PARTITION_TYPES)] = {
@@ -277,7 +334,36 @@
           { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) },
       }
     };
-#else
+#elif CONFIG_EXT_RECUR_PARTITIONS
+static const aom_cdf_prob
+    default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)] = {
+      // 8x8
+      { AOM_CDF3(22187, 28914) },
+      { AOM_CDF3(17354, 25544) },
+      { AOM_CDF3(16287, 28824) },
+      { AOM_CDF3(15189, 27217) },
+      // 16x16
+      { AOM_CDF5(14665, 22357, 28960, 30645) },
+      { AOM_CDF5(7804, 13703, 27420, 29025) },
+      { AOM_CDF5(7508, 23180, 28044, 30882) },
+      { AOM_CDF5(5058, 16122, 25275, 28359) },
+      // 32x32
+      { AOM_CDF5(11795, 19886, 27120, 29401) },
+      { AOM_CDF5(5127, 12682, 26374, 28387) },
+      { AOM_CDF5(5522, 19614, 27318, 30668) },
+      { AOM_CDF5(3450, 12856, 24163, 28493) },
+      // 64x64
+      { AOM_CDF5(21562, 26118, 30872, 31711) },
+      { AOM_CDF5(5489, 14515, 28365, 29969) },
+      { AOM_CDF5(5873, 19195, 28209, 31603) },
+      { AOM_CDF5(1674, 15579, 28805, 31560) },
+      // 128x128
+      { AOM_CDF3(25710, 28640) },
+      { AOM_CDF3(7561, 14721) },
+      { AOM_CDF3(9603, 21021) },
+      { AOM_CDF3(1736, 12989) },
+    };
+#else  // !CONFIG_EXT_RECUR_PARTITIONS && !CONFIG_SDP
 static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(
     EXT_PARTITION_TYPES)] = {
   { AOM_CDF4(19132, 25510, 30392) },
@@ -303,6 +389,38 @@
 };
 #endif
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static const aom_cdf_prob
+    default_partition_rec_cdf[PARTITION_CONTEXTS_REC]
+                             [CDF_SIZE(PARTITION_TYPES_REC)] = {
+                               // 8x4, 4x8
+                               { AOM_CDF2(30462) },
+                               { AOM_CDF2(25506) },
+                               { AOM_CDF2(27632) },
+                               { AOM_CDF2(19443) },
+                               // 16x8, 8x16
+                               { AOM_CDF4(19986, 29676, 30790) },
+                               { AOM_CDF4(12672, 24996, 30937) },
+                               { AOM_CDF4(16895, 30097, 30155) },
+                               { AOM_CDF4(10676, 22283, 25595) },
+                               // 32x16, 16x32
+                               { AOM_CDF4(13648, 24298, 31008) },
+                               { AOM_CDF4(6941, 18823, 31326) },
+                               { AOM_CDF4(8081, 19947, 30935) },
+                               { AOM_CDF4(4728, 17352, 30577) },
+                               // 64x32, 32x64
+                               { AOM_CDF4(14004, 23614, 30662) },
+                               { AOM_CDF4(5530, 18449, 30965) },
+                               { AOM_CDF4(6144, 19185, 31435) },
+                               { AOM_CDF4(7382, 19434, 30389) },
+                               // 128x64, 64x128
+                               { AOM_CDF3(15208, 24398) },
+                               { AOM_CDF3(6597, 18232) },
+                               { AOM_CDF3(9068, 21038) },
+                               { AOM_CDF3(10923, 21845) },
+                             };
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 static const aom_cdf_prob default_intra_ext_tx_cdf
     [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
       {
@@ -1399,6 +1517,9 @@
 #endif
   av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
   av1_copy(fc->partition_cdf, default_partition_cdf);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  av1_copy(fc->partition_rec_cdf, default_partition_rec_cdf);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
   av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
   av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 235542b..1711602 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -161,7 +161,11 @@
                             [CDF_SIZE(EXT_PARTITION_TYPES)];
 #else
   aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
-#endif
+#endif  // CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+  aom_cdf_prob partition_rec_cdf[PARTITION_CONTEXTS_REC]
+                                [CDF_SIZE(PARTITION_TYPES_REC)];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
                                     [CDF_SIZE(SWITCHABLE_FILTERS)];
   /* kf_y_cdf is discarded after use, so does not require persistent storage.
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 697fdee..743a5dc 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -50,6 +50,8 @@
 #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
 #define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
 
+#define MAX_MIB_SQUARE (MAX_MIB_SIZE * MAX_MIB_SIZE)
+
 // MI-units per min superblock
 #define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
 
@@ -165,6 +167,10 @@
 // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
 #define SQR_BLOCK_SIZES 6
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+#define KEEP_PARTITION_SPLIT 0
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 //  Partition types.  R: Recursive
 //
 //  NONE          HORZ          VERT          SPLIT
@@ -174,6 +180,17 @@
 //  |       |     |       |     |   |   |     | R | R |
 //  +-------+     +-------+     +---+---+     +---+---+
 //
+#if CONFIG_EXT_RECUR_PARTITIONS
+//  HORZ_3                 VERT_3
+//  +--------------+       +---+------+---+
+//  |              |       |   |      |   |
+//  +--------------+       |   |      |   |
+//  |              |       |   |      |   |
+//  |              |       |   |      |   |
+//  +--------------+       |   |      |   |
+//  |              |       |   |      |   |
+//  +--------------+       +---+------+---+
+#else
 //  HORZ_A        HORZ_B        VERT_A        VERT_B
 //  +---+---+     +-------+     +---+---+     +---+---+
 //  |   |   |     |       |     |   |   |     |   |   |
@@ -186,6 +203,20 @@
 //  +-----+       | | | |
 //  +-----+       | | | |
 //  +-----+       +-+-+-+
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_EXT_RECUR_PARTITIONS
+enum {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_HORZ_3,  // 3 horizontal sub-partitions with ratios 4:1, 2:1 and 4:1
+  PARTITION_VERT_3,  // 3 vertical sub-partitions with ratios 4:1, 2:1 and 4:1
+  EXT_PARTITION_TYPES,
+  PARTITION_SPLIT = EXT_PARTITION_TYPES,
+  PARTITION_TYPES = PARTITION_VERT + 1,
+  PARTITION_INVALID = 255
+} UENUM1BYTE(PARTITION_TYPE);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
 enum {
   PARTITION_NONE,
   PARTITION_HORZ,
@@ -201,12 +232,27 @@
   PARTITION_TYPES = PARTITION_SPLIT + 1,
   PARTITION_INVALID = 255
 } UENUM1BYTE(PARTITION_TYPE);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET 4  // number of probability models per block size
 #define PARTITION_BLOCK_SIZES 5
 #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+enum {
+  PARTITION_NONE_REC,
+  PARTITION_LONG_SIDE_2_REC,
+  PARTITION_LONG_SIDE_3_REC,
+  PARTITION_SHORT_SIDE_2_REC,
+  PARTITION_TYPES_REC = PARTITION_SHORT_SIDE_2_REC + 1,
+  PARTITION_INVALID_REC = 255
+} UENUM1BYTE(PARTITION_TYPE_REC);
+
+#define PARTITION_BLOCK_SIZES_REC 5  // 128x64, 64x32, 32x16, 16x8, 8x4
+#define PARTITION_CONTEXTS_REC (PARTITION_BLOCK_SIZES_REC * PARTITION_PLOFFSET)
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 // block transform size
 enum {
   TX_4X4,             // 4x4 transform
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index bd02cf3..de7eb9c 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -337,10 +337,14 @@
 }
 
 static AOM_INLINE void scan_row_mbmi(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col,
-    const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack,
-    uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
-    uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset,
+    const AV1_COMMON *cm, const MACROBLOCKD *xd,
+#if CONFIG_EXT_RECUR_PARTITIONS
+    int mi_row,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset,
+    CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *refmv_count,
+    uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates,
+    int max_row_offset,
 #if CONFIG_SMVP_IMPROVEMENT
     int add_more_mvs, SINGLE_MV_CANDIDATE *single_mv, uint8_t *single_mv_count,
     CANDIDATE_MV *derived_mv_stack, uint16_t *derived_mv_weight,
@@ -363,6 +367,24 @@
   const int plane_type = (xd->tree_type == CHROMA_PART);
 #endif
   for (int i = 0; i < end_mi;) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+    const int mask_row = mi_row & (sb_mi_size - 1);
+    const int mask_col = mi_col & (sb_mi_size - 1);
+    const int ref_mask_row = mask_row + row_offset;
+    const int ref_mask_col = mask_col + col_offset + i;
+    if (ref_mask_row >= 0) {
+      if (ref_mask_col >= sb_mi_size) break;
+
+      const int ref_offset =
+          ref_mask_row * xd->is_mi_coded_stride + ref_mask_col;
+#if CONFIG_SDP
+      if (!xd->is_mi_coded[0][ref_offset]) break;
+#else
+      if (!xd->is_mi_coded[ref_offset]) break;
+#endif  // CONFIG_SDP
+    }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
 #if CONFIG_SDP
     const int candidate_bsize = candidate->sb_type[plane_type];
@@ -401,6 +423,9 @@
 
 static AOM_INLINE void scan_col_mbmi(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row,
+#if CONFIG_EXT_RECUR_PARTITIONS
+    int mi_col,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack,
     uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
     uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset,
@@ -423,6 +448,23 @@
   const int use_step_16 = (xd->height >= 16);
 
   for (i = 0; i < end_mi;) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+    const int mask_row = mi_row & (sb_mi_size - 1);
+    const int mask_col = mi_col & (sb_mi_size - 1);
+    const int ref_mask_row = mask_row + row_offset + i;
+    const int ref_mask_col = mask_col + col_offset;
+    if (ref_mask_col >= 0) {
+      if (ref_mask_row >= sb_mi_size) break;
+      const int ref_offset =
+          ref_mask_row * xd->is_mi_coded_stride + ref_mask_col;
+#if CONFIG_SDP
+      if (!xd->is_mi_coded[0][ref_offset]) break;
+#else
+      if (!xd->is_mi_coded[ref_offset]) break;
+#endif  // CONFIG_SDP
+    }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
 #if CONFIG_SDP
@@ -494,6 +536,44 @@
   }  // Analyze a single 8x8 block motion information.
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                         int mi_row, int mi_col, int n4_w) {
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int mask_row = mi_row & (sb_mi_size - 1);
+  const int mask_col = mi_col & (sb_mi_size - 1);
+
+  if (n4_w > mi_size_wide[BLOCK_64X64]) return 0;
+
+  const int tr_mask_row = mask_row - 1;
+  const int tr_mask_col = mask_col + n4_w;
+  int has_tr;
+
+  if (tr_mask_row < 0) {
+    // The top-right block is in a superblock above the current sb row. If it is
+    // in the current tile or a previously coded one, it has been coded.
+    // Otherwise later the tile boundary checker will figure out whether it is
+    // available.
+    has_tr = 1;
+  } else if (tr_mask_col >= sb_mi_size) {
+    // The top-right block is in the superblock on the right side, therefore it
+    // is not coded yet.
+    has_tr = 0;
+  } else {
+    // For a general case, we use is_mi_coded array for the current superblock
+    // to figure out the availability.
+    const int tr_offset = tr_mask_row * xd->is_mi_coded_stride + tr_mask_col;
+
+#if CONFIG_SDP
+    has_tr = xd->is_mi_coded[av1_get_sdp_idx(xd->tree_type)][tr_offset];
+#else
+    has_tr = xd->is_mi_coded[tr_offset];
+#endif  // CONFIG_SDP
+  }
+
+  return has_tr;
+}
+#else
 static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                          int mi_row, int mi_col, int bs) {
   const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
@@ -545,6 +625,7 @@
 
   return has_tr;
 }
+#endif
 
 static int check_sb_border(const int mi_row, const int mi_col,
                            const int row_offset, const int col_offset) {
@@ -719,8 +800,12 @@
     uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
     int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
     int mi_row, int mi_col, int16_t *mode_context) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int has_tr = has_top_right(cm, xd, mi_row, mi_col, xd->width);
+#else
   const int bs = AOMMAX(xd->width, xd->height);
   const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
+#endif
   MV_REFERENCE_FRAME rf[2];
 
   const TileInfo *const tile = &xd->tile;
@@ -775,8 +860,12 @@
 
   // Scan the first above row mode info. row_offset = -1;
   if (abs(max_row_offset) >= 1)
-    scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
-                  refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
+    scan_row_mbmi(cm, xd,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                  mi_row,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                  mi_col, rf, -1, ref_mv_stack, ref_mv_weight, refmv_count,
+                  &row_match_count, &newmv_count, gm_mv_candidates,
                   max_row_offset,
 #if CONFIG_SMVP_IMPROVEMENT
                   1, single_mv, &single_mv_count, derived_mv_stack,
@@ -786,8 +875,12 @@
 
   // Scan the first left column mode info. col_offset = -1;
   if (abs(max_col_offset) >= 1)
-    scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
-                  refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
+    scan_col_mbmi(cm, xd, mi_row,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                  mi_col,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                  rf, -1, ref_mv_stack, ref_mv_weight, refmv_count,
+                  &col_match_count, &newmv_count, gm_mv_candidates,
                   max_col_offset,
 #if CONFIG_SMVP_IMPROVEMENT
                   1, single_mv, &single_mv_count, derived_mv_stack,
@@ -875,11 +968,15 @@
     const int col_offset = -(idx << 1) + 1 + col_adj;
     if (abs(col_offset) <= abs(max_col_offset) &&
         abs(col_offset) > processed_cols) {
-      scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
-                    refmv_count, &col_match_count, &dummy_newmv_count,
-                    gm_mv_candidates, max_col_offset, 0, single_mv,
-                    &single_mv_count, derived_mv_stack, derived_mv_weight,
-                    &derived_mv_count, &processed_cols);
+      scan_col_mbmi(cm, xd, mi_row,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                    mi_col,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                    rf, col_offset, ref_mv_stack, ref_mv_weight, refmv_count,
+                    &col_match_count, &dummy_newmv_count, gm_mv_candidates,
+                    max_col_offset, 0, single_mv, &single_mv_count,
+                    derived_mv_stack, derived_mv_weight, &derived_mv_count,
+                    &processed_cols);
     }
   }
 #else
@@ -889,15 +986,23 @@
 
     if (abs(row_offset) <= abs(max_row_offset) &&
         abs(row_offset) > processed_rows)
-      scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
+      scan_row_mbmi(cm, xd,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                    mi_row,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                    mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
                     refmv_count, &row_match_count, &dummy_newmv_count,
                     gm_mv_candidates, max_row_offset, &processed_rows);
 
     if (abs(col_offset) <= abs(max_col_offset) &&
         abs(col_offset) > processed_cols)
-      scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
-                    refmv_count, &col_match_count, &dummy_newmv_count,
-                    gm_mv_candidates, max_col_offset, &processed_cols);
+      scan_col_mbmi(cm, xd, mi_row,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                    mi_col,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                    rf, col_offset, ref_mv_stack, ref_mv_weight, refmv_count,
+                    &col_match_count, &dummy_newmv_count, gm_mv_candidates,
+                    max_col_offset, &processed_cols);
   }
 #endif  // CONFIG_SMVP_IMPROVEMENT
 
@@ -1815,8 +1920,12 @@
   assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 
   // Top-right block
-  if (do_tr &&
-      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (do_tr && has_top_right(cm, xd, mi_row, mi_col, xd->width)) {
+#else
+  const int bs = AOMMAX(xd->width, xd->height);
+  if (do_tr && has_top_right(cm, xd, mi_row, mi_col, bs)) {
+#endif
     const POSITION trb_pos = { -1, xd->width };
     const TileInfo *const tile = &xd->tile;
     if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index b03024e..b830750 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -765,27 +765,30 @@
 //  2. At least one dimension is size 4 with subsampling
 //  3. If sub-sampled, none of the previous blocks around the sub-sample
 //     are intrabc or inter-blocks
-static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
-                            int is_intrabc, int build_for_obmc) {
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, const MB_MODE_INFO *mi,
+                            int plane, int is_intrabc, int build_for_obmc) {
   if (is_intrabc || build_for_obmc) {
     return false;
   }
 
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
-  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
-  if (!is_sub4_x && !is_sub4_y) {
+#if CONFIG_SDP
+  if (!(plane &&
+        (mi->sb_type[PLANE_TYPE_UV] != mi->chroma_ref_info.bsize_base)))
     return false;
-  }
+#else
+  if (!(plane && (mi->sb_type != mi->chroma_ref_info.bsize_base))) return false;
+#endif  // CONFIG_SDP
 
   // For sub8x8 chroma blocks, we may be covering more than one luma block's
   // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
   // the top-left corner of the prediction source - the correct top-left corner
   // is at (pre_x, pre_y).
-  const int row_start = is_sub4_y ? -1 : 0;
-  const int col_start = is_sub4_x ? -1 : 0;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int row_start =
+      plane ? mi->chroma_ref_info.mi_row_chroma_base - mi_row : 0;
+  const int col_start =
+      plane ? mi->chroma_ref_info.mi_col_chroma_base - mi_col : 0;
 
   for (int row = row_start; row <= 0; ++row) {
     for (int col = col_start; col <= 0; ++col) {
@@ -816,11 +819,9 @@
   const bool ss_y = pd->subsampling_y;
   const int b4_w = block_size_wide[bsize] >> ss_x;
   const int b4_h = block_size_high[bsize] >> ss_y;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
-  const int b8_w = block_size_wide[plane_bsize];
-  const int b8_h = block_size_high[plane_bsize];
-  const int is_compound = has_second_ref(mi);
-  assert(!is_compound);
+  const BLOCK_SIZE plane_bsize = plane ? mi->chroma_ref_info.bsize_base : bsize;
+  const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+  const int b8_h = block_size_high[plane_bsize] >> ss_y;
 #if CONFIG_SDP
   assert(!is_intrabc_block(mi, xd->tree_type));
 #else
@@ -831,8 +832,10 @@
   // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
   // the top-left corner of the prediction source - the correct top-left corner
   // is at (pre_x, pre_y).
-  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
-  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+  const int row_start =
+      plane ? (mi->chroma_ref_info.mi_row_chroma_base - xd->mi_row) : 0;
+  const int col_start =
+      plane ? (mi->chroma_ref_info.mi_col_chroma_base - xd->mi_col) : 0;
   const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
   const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
 
@@ -841,6 +844,13 @@
     int col = col_start;
     for (int x = 0; x < b8_w; x += b4_w) {
       MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+#if CONFIG_EXT_RECUR_PARTITIONS
+      // TODO(yuec): enable compound prediction for non-sub8x8 mbs in the
+      // group
+      bool is_compound = 0;
+#else
+      bool is_compound = has_second_ref(this_mbmi);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       struct buf_2d *const dst_buf = &pd->dst;
       uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
       int ref = 0;
@@ -858,7 +868,6 @@
       };
 
       const MV mv = this_mbmi->mv[ref].as_mv;
-
       InterPredParams inter_pred_params;
       av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
                             pre_x + x, pd->subsampling_x, pd->subsampling_y,
@@ -881,9 +890,9 @@
                                     &inter_pred_params, xd, mi_x + x, mi_y + y,
                                     ref, mc_buf, calc_subpel_params_func);
 
-      ++col;
+      col += mi_size_wide[bsize];
     }
-    ++row;
+    row += mi_size_high[bsize];
   }
 }
 
@@ -907,19 +916,17 @@
     const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
     is_global[ref] = is_global_mv_block(mi, wm->wmtype);
   }
-#if CONFIG_SDP
-  const BLOCK_SIZE bsize = mi->sb_type[PLANE_TYPE_Y];
-#else
-  const BLOCK_SIZE bsize = mi->sb_type;
-#endif
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  int row_start = 0;
+  int col_start = 0;
+  if (!build_for_obmc) {
+    const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+    const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+    row_start = plane ? (mi->chroma_ref_info.mi_row_chroma_base - mi_row) : 0;
+    col_start = plane ? (mi->chroma_ref_info.mi_col_chroma_base - mi_col) : 0;
+  }
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> pd->subsampling_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> pd->subsampling_y;
 
   for (int ref = 0; ref < 1 + is_compound; ++ref) {
     const struct scale_factors *const sf =
@@ -983,13 +990,14 @@
                                 int mi_y, uint8_t **mc_buf,
                                 CalcSubpelParamsFunc calc_subpel_params_func) {
 #if CONFIG_SDP
-  if (is_sub8x8_inter(xd, plane, mi->sb_type[PLANE_TYPE_Y],
-                      is_intrabc_block(mi, xd->tree_type), build_for_obmc)) {
-#else
-  if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi),
+  if (is_sub8x8_inter(xd, mi, plane, is_intrabc_block(mi, xd->tree_type),
                       build_for_obmc)) {
+#else
+  if (is_sub8x8_inter(xd, mi, plane, is_intrabc_block(mi), build_for_obmc)) {
 #endif
+#if !CONFIG_EXT_RECUR_PARTITIONS
     assert(bw < 8 || bh < 8);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf,
                                   calc_subpel_params_func);
   } else {
@@ -998,41 +1006,37 @@
                                           calc_subpel_params_func);
   }
 }
-void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+
+void av1_setup_dst_planes(struct macroblockd_plane *planes,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const int plane_start, const int plane_end) {
+                          const int plane_start, const int plane_end,
+                          const CHROMA_REF_INFO *chr_ref_info) {
   // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
   // the static analysis warnings.
   for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
     struct macroblockd_plane *const pd = &planes[i];
     const int is_uv = i > 0;
-    setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
+    setup_pred_plane(&pd->dst, src->buffers[i], src->crop_widths[is_uv],
                      src->crop_heights[is_uv], src->strides[is_uv], mi_row,
-                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
+                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y,
+                     chr_ref_info);
   }
 }
 
 void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const struct scale_factors *sf,
-                          const int num_planes) {
+                          const struct scale_factors *sf, const int num_planes,
+                          const CHROMA_REF_INFO *chr_ref_info) {
   if (src != NULL) {
     // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
     // the static analysis warnings.
     for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
       struct macroblockd_plane *const pd = &xd->plane[i];
       const int is_uv = i > 0;
-#if CONFIG_SDP
-      setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type[PLANE_TYPE_Y],
-                       src->buffers[i], src->crop_widths[is_uv],
+      setup_pred_plane(&pd->pre[idx], src->buffers[i], src->crop_widths[is_uv],
                        src->crop_heights[is_uv], src->strides[is_uv], mi_row,
-                       mi_col, sf, pd->subsampling_x, pd->subsampling_y);
-#else
-      setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i],
-                       src->crop_widths[is_uv], src->crop_heights[is_uv],
-                       src->strides[is_uv], mi_row, mi_col, sf,
-                       pd->subsampling_x, pd->subsampling_y);
-#endif
+                       mi_col, sf, pd->subsampling_x, pd->subsampling_y,
+                       chr_ref_info);
     }
   }
 }
@@ -1093,9 +1097,12 @@
   mbmi->overlappable_neighbors[0] = 0;
   mbmi->overlappable_neighbors[1] = 0;
 #if CONFIG_SDP
-  if (!is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y])) return;
+  if (!is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y],
+                                         xd->mi_row, xd->mi_col))
+    return;
 #else
-  if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
+  if (!is_motion_variation_allowed_bsize(mbmi->sb_type, xd->mi_row, xd->mi_col))
+    return;
 #endif
 
   foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
@@ -1111,8 +1118,6 @@
 
 int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
                                const struct macroblockd_plane *pd, int dir) {
-  assert(is_motion_variation_allowed_bsize(bsize));
-
   const BLOCK_SIZE bsize_plane =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   switch (bsize_plane) {
@@ -1274,21 +1279,15 @@
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
     const int num_planes) {
-#if CONFIG_SDP
-  const BLOCK_SIZE a_bsize =
-      AOMMAX(BLOCK_8X8, above_mbmi->sb_type[PLANE_TYPE_Y]);
-#else
-  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
-#endif
   const int above_mi_col = xd->mi_col + rel_mi_col;
 
   av1_modify_neighbor_predictor_for_obmc(above_mbmi);
 
   for (int j = 0; j < num_planes; ++j) {
     struct macroblockd_plane *const pd = &xd->plane[j];
-    setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+    setup_pred_plane(&pd->dst, ctxt->tmp_buf[j], ctxt->tmp_width[j],
                      ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+                     NULL, pd->subsampling_x, pd->subsampling_y, NULL);
   }
 
   const int num_refs = 1 + has_second_ref(above_mbmi);
@@ -1304,7 +1303,7 @@
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
-                         num_planes);
+                         num_planes, NULL);
   }
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
@@ -1318,21 +1317,15 @@
                                              MB_MODE_INFO *left_mbmi,
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes) {
-#if CONFIG_SDP
-  const BLOCK_SIZE l_bsize =
-      AOMMAX(BLOCK_8X8, left_mbmi->sb_type[PLANE_TYPE_Y]);
-#else
-  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
-#endif
   const int left_mi_row = xd->mi_row + rel_mi_row;
 
   av1_modify_neighbor_predictor_for_obmc(left_mbmi);
 
   for (int j = 0; j < num_planes; ++j) {
     struct macroblockd_plane *const pd = &xd->plane[j];
-    setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+    setup_pred_plane(&pd->dst, ctxt->tmp_buf[j], ctxt->tmp_width[j],
                      ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+                     NULL, pd->subsampling_x, pd->subsampling_y, NULL);
   }
 
   const int num_refs = 1 + has_second_ref(left_mbmi);
@@ -1349,7 +1342,7 @@
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col,
-                         ref_scale_factors, num_planes);
+                         ref_scale_factors, num_planes, NULL);
   }
 
   xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row));
@@ -1412,15 +1405,27 @@
                             bd);
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+                                               MACROBLOCKD *xd, int plane,
+                                               const BUFFER_SET *ctx,
+                                               uint8_t *dst, int dst_stride) {
+#else
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd,
                                                BLOCK_SIZE bsize, int plane,
                                                const BUFFER_SET *ctx,
                                                uint8_t *dst, int dst_stride) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ssx = xd->plane[plane].subsampling_x;
   const int ssy = xd->plane[plane].subsampling_y;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  BLOCK_SIZE plane_bsize =
+      get_mb_plane_block_size(xd, xd->mi[0], plane, ssx, ssy);
+#else
   BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
   assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
   assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
@@ -1446,7 +1451,15 @@
                             const uint8_t *intra_pred, int intra_stride) {
   const int ssx = xd->plane[plane].subsampling_x;
   const int ssy = xd->plane[plane].subsampling_y;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+  BLOCK_SIZE plane_bsize =
+      get_mb_plane_block_size(xd, xd->mi[0], plane, ssx, ssy);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  assert(plane_bsize == get_plane_block_size(bsize, ssx, ssy));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#else
+  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   if (is_cur_buf_hbd(xd)) {
     combine_interintra_highbd(
@@ -1472,15 +1485,25 @@
   assert(bsize < BLOCK_SIZES_ALL);
   if (is_cur_buf_hbd(xd)) {
     DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_build_intra_predictors_for_interintra(
+        cm, xd, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+#else
     av1_build_intra_predictors_for_interintra(
         cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
         MAX_SB_SIZE);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     av1_combine_interintra(xd, bsize, plane, pred, stride,
                            CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
   } else {
     DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_build_intra_predictors_for_interintra(cm, xd, plane, ctx,
+                                              intrapredictor, MAX_SB_SIZE);
+#else
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
                                               intrapredictor, MAX_SB_SIZE);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
                            MAX_SB_SIZE);
   }
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 6e6ab2c..323940f 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -298,16 +298,21 @@
   return (int64_t)y * stride + x;
 }
 
-static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
-                                    uint8_t *src, int width, int height,
-                                    int stride, int mi_row, int mi_col,
+static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, int width,
+                                    int height, int stride, int mi_row,
+                                    int mi_col,
                                     const struct scale_factors *scale,
-                                    int subsampling_x, int subsampling_y) {
+                                    int subsampling_x, int subsampling_y,
+                                    const CHROMA_REF_INFO *chr_ref_info) {
   // Offset the buffer pointer
-  if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
-    mi_row -= 1;
-  if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
-    mi_col -= 1;
+  // The old bsize-based alignment (stepping mi_row/mi_col back by one for
+  // subsampled 4-pixel dimensions) is superseded by chr_ref_info: when it is
+  // provided and the plane is subsampled, anchor the buffer at the chroma
+  // reference base position instead.
+  if (chr_ref_info && (subsampling_x || subsampling_y)) {
+    mi_row = chr_ref_info->mi_row_chroma_base;
+    mi_col = chr_ref_info->mi_col_chroma_base;
+  }
 
   const int x = (MI_SIZE * mi_col) >> subsampling_x;
   const int y = (MI_SIZE * mi_row) >> subsampling_y;
@@ -318,13 +323,15 @@
   dst->stride = stride;
 }
 
-void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+void av1_setup_dst_planes(struct macroblockd_plane *planes,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const int plane_start, const int plane_end);
+                          const int plane_start, const int plane_end,
+                          const CHROMA_REF_INFO *chr_ref_info);
 
 void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const struct scale_factors *sf, const int num_planes);
+                          const struct scale_factors *sf, const int num_planes,
+                          const CHROMA_REF_INFO *chr_ref_info);
 
 static INLINE void set_default_interp_filters(
     MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
@@ -393,11 +400,18 @@
                                     const BUFFER_SET *ctx, int plane,
                                     BLOCK_SIZE bsize);
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+                                               MACROBLOCKD *xd, int plane,
+                                               const BUFFER_SET *ctx,
+                                               uint8_t *dst, int dst_stride);
+#else
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd,
                                                BLOCK_SIZE bsize, int plane,
                                                const BUFFER_SET *ctx,
                                                uint8_t *dst, int dst_stride);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 73197b8..36f4288 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -65,6 +65,97 @@
   NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // PAETH
 };
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                         BLOCK_SIZE bsize, int mi_row, int mi_col,
+                         int top_available, int right_available, TX_SIZE txsz,
+                         int row_off, int col_off, int ss_x, int ss_y,
+                         int px_to_right_edge, int *px_top_right,
+                         int is_bsize_altered_for_chroma) {
+  if (!top_available || !right_available) return 0;
+
+  const int bw_unit = mi_size_wide[bsize];
+  const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
+  const int top_right_count_unit = tx_size_wide_unit[txsz];
+  const int px_tr_common = AOMMIN(tx_size_wide[txsz], px_to_right_edge);
+
+  if (px_tr_common <= 0) return 0;
+
+  *px_top_right = px_tr_common;
+
+  if (row_off > 0) {  // Just need to check if enough pixels on the right.
+    if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
+      // Special case: For 128x128 blocks, the transform unit whose
+      // top-right corner is at the center of the block does in fact have
+      // pixels available at its top-right corner.
+      if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
+          col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
+        return 1;
+      }
+      const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+      const int col_off_64 = col_off % plane_bw_unit_64;
+      return col_off_64 + top_right_count_unit < plane_bw_unit_64;
+    }
+    return col_off + top_right_count_unit < plane_bw_unit;
+  } else {
+    // All top-right pixels are in the block above, which is already available.
+    if (col_off + top_right_count_unit < plane_bw_unit) return 1;
+
+    // Handle the top-right intra tx block of the coding block
+    const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+    const int mi_row_aligned =
+        is_bsize_altered_for_chroma
+            ? xd->mi[0]->chroma_ref_info.mi_row_chroma_base
+            : mi_row;
+    const int mi_col_aligned =
+        is_bsize_altered_for_chroma
+            ? xd->mi[0]->chroma_ref_info.mi_col_chroma_base
+            : mi_col;
+    const int tr_mask_row = (mi_row_aligned & (sb_mi_size - 1)) - 1;
+    const int tr_mask_col =
+        (mi_col_aligned & (sb_mi_size - 1)) + mi_size_wide[bsize];
+
+    if (tr_mask_row < 0) {
+      return 1;
+    } else if (tr_mask_col >= sb_mi_size) {
+      return 0;
+    } else {  // Handle the general case: the top_right mi is in the same SB
+      const int tr_offset = tr_mask_row * xd->is_mi_coded_stride + tr_mask_col;
+      // As long as the first mi is available, we determine tr is available
+#if CONFIG_SDP
+      int has_tr = xd->is_mi_coded[av1_get_sdp_idx(xd->tree_type)][tr_offset];
+#else
+      int has_tr = xd->is_mi_coded[tr_offset];
+#endif  // CONFIG_SDP
+
+      // Calculate px_top_right: how many top-right pixels are available. If it
+      // is less than tx_size_wide[txsz], px_top_right will be used to
+      // determine the location of the last available pixel, which will be used
+      // for padding.
+      if (has_tr) {
+        int mi_tr = 0;
+        for (int i = 0; i < top_right_count_unit << ss_x; ++i) {
+          if ((tr_mask_col + i) >= sb_mi_size ||
+#if CONFIG_SDP
+              !xd->is_mi_coded[av1_get_sdp_idx(xd->tree_type)][tr_offset + i]
+#else
+              !xd->is_mi_coded[tr_offset + i]
+#endif  // CONFIG_SDP
+          ) {
+            break;
+          } else {
+            mi_tr++;
+          }
+        }
+
+        *px_top_right = AOMMIN((mi_tr << MI_SIZE_LOG2) >> ss_x, px_tr_common);
+      }
+
+      return has_tr;
+    }
+  }
+}
+#else
 // Tables to store if the top-right reference pixels are available. The flags
 // are represented with bits, packed into 8-bit integers. E.g., for the 32x32
 // blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster
@@ -260,142 +351,22 @@
     return (has_tr_table[idx1] >> idx2) & 1;
   }
 }
+#endif
 
-// Similar to the has_tr_* tables, but store if the bottom-left reference
-// pixels are available.
-static uint8_t has_bl_4x4[128] = {
-  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85, 85,
-  85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  1,  0,  84, 85, 85, 85, 16, 17,
-  17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85, 85, 85, 16, 17, 17, 17, 84,
-  85, 85, 85, 0,  0,  0,  0,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85,
-  0,  1,  1,  1,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  1,
-  0,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85,
-  85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  0,  0,
-};
-static uint8_t has_bl_4x8[64] = {
-  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
-  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
-  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
-  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
-};
-static uint8_t has_bl_8x4[64] = {
-  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
-  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
-  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
-  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
-};
-static uint8_t has_bl_8x8[32] = {
-  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
-  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
-};
-static uint8_t has_bl_8x16[16] = {
-  16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0,
-};
-static uint8_t has_bl_16x8[16] = {
-  254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0,
-};
-static uint8_t has_bl_16x16[8] = {
-  84, 16, 84, 0, 84, 16, 84, 0,
-};
-static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 };
-static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 };
-static uint8_t has_bl_32x32[2] = { 4, 4 };
-static uint8_t has_bl_32x64[1] = { 0 };
-static uint8_t has_bl_64x32[1] = { 34 };
-static uint8_t has_bl_64x64[1] = { 0 };
-static uint8_t has_bl_64x128[1] = { 0 };
-static uint8_t has_bl_128x64[1] = { 0 };
-static uint8_t has_bl_128x128[1] = { 0 };
-static uint8_t has_bl_4x16[32] = {
-  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
-  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
-};
-static uint8_t has_bl_16x4[32] = {
-  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
-  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
-};
-static uint8_t has_bl_8x32[8] = {
-  0, 1, 0, 0, 0, 1, 0, 0,
-};
-static uint8_t has_bl_32x8[8] = {
-  238, 78, 238, 14, 238, 78, 238, 14,
-};
-static uint8_t has_bl_16x64[2] = { 0, 0 };
-static uint8_t has_bl_64x16[2] = { 42, 42 };
-
-static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = {
-  // 4X4
-  has_bl_4x4,
-  // 4X8,         8X4,         8X8
-  has_bl_4x8, has_bl_8x4, has_bl_8x8,
-  // 8X16,        16X8,        16X16
-  has_bl_8x16, has_bl_16x8, has_bl_16x16,
-  // 16X32,       32X16,       32X32
-  has_bl_16x32, has_bl_32x16, has_bl_32x32,
-  // 32X64,       64X32,       64X64
-  has_bl_32x64, has_bl_64x32, has_bl_64x64,
-  // 64x128,      128x64,      128x128
-  has_bl_64x128, has_bl_128x64, has_bl_128x128,
-  // 4x16,        16x4,        8x32
-  has_bl_4x16, has_bl_16x4, has_bl_8x32,
-  // 32x8,        16x64,       64x16
-  has_bl_32x8, has_bl_16x64, has_bl_64x16
-};
-
-static uint8_t has_bl_vert_8x8[32] = {
-  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
-  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
-};
-static uint8_t has_bl_vert_16x16[8] = {
-  254, 16, 254, 0, 254, 16, 254, 0,
-};
-static uint8_t has_bl_vert_32x32[2] = { 14, 14 };
-static uint8_t has_bl_vert_64x64[1] = { 2 };
-
-// The _vert_* tables are like the ordinary tables above, but describe the
-// order we visit square blocks when doing a PARTITION_VERT_A or
-// PARTITION_VERT_B. This is the same order as normal except for on the last
-// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
-// as a pair of squares, which means that these tables work correctly for both
-// mixed vertical partition types.
-//
-// There are tables for each of the square sizes. Vertical rectangles (like
-// BLOCK_16X32) use their respective "non-vert" table
-static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = {
-  // 4X4
-  NULL,
-  // 4X8,     8X4,         8X8
-  has_bl_4x8, NULL, has_bl_vert_8x8,
-  // 8X16,    16X8,        16X16
-  has_bl_8x16, NULL, has_bl_vert_16x16,
-  // 16X32,   32X16,       32X32
-  has_bl_16x32, NULL, has_bl_vert_32x32,
-  // 32X64,   64X32,       64X64
-  has_bl_32x64, NULL, has_bl_vert_64x64,
-  // 64x128,  128x64,      128x128
-  has_bl_64x128, NULL, has_bl_128x128
-};
-
-static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
-                                       BLOCK_SIZE bsize) {
-  const uint8_t *ret = NULL;
-  // If this is a mixed vertical partition, look up bsize in orders_vert.
-  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
-    assert(bsize < BLOCK_SIZES);
-    ret = has_bl_vert_tables[bsize];
-  } else {
-    ret = has_bl_tables[bsize];
-  }
-  assert(ret);
-  return ret;
-}
-
-static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
-                           int mi_col, int bottom_available, int left_available,
-                           PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
-                           int col_off, int ss_x, int ss_y) {
+static int has_bottom_left(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                           BLOCK_SIZE bsize, int mi_row, int mi_col,
+                           int bottom_available, int left_available,
+                           TX_SIZE txsz, int row_off, int col_off, int ss_x,
+                           int ss_y, int px_to_bottom_edge, int *px_bottom_left,
+                           int is_bsize_altered_for_chroma) {
   if (!bottom_available || !left_available) return 0;
 
+  const int px_bl_common = AOMMIN(tx_size_high[txsz], px_to_bottom_edge);
+
+  if (px_bl_common <= 0) return 0;
+
+  *px_bottom_left = px_bl_common;
+
   // Special case for 128x* blocks, when col_off is half the block width.
   // This is needed because 128x* superblocks are divided into 64x* blocks in
   // raster order
@@ -425,37 +396,70 @@
     // All bottom-left pixels are in the left block, which is already available.
     if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
 
-    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
-    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+    // The general case: neither the leftmost column nor the bottom row. The
+    // bottom-left mi is in the same SB
     const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
-    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
-    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+    const int mi_row_aligned =
+        is_bsize_altered_for_chroma
+            ? xd->mi[0]->chroma_ref_info.mi_row_chroma_base
+            : mi_row;
+    const int mi_col_aligned =
+        is_bsize_altered_for_chroma
+            ? xd->mi[0]->chroma_ref_info.mi_col_chroma_base
+            : mi_col;
+    const int bl_mask_row =
+        (mi_row_aligned & (sb_mi_size - 1)) + mi_size_high[bsize];
+    const int bl_mask_col = (mi_col_aligned & (sb_mi_size - 1)) - 1;
 
-    // Leftmost column of superblock: so bottom-left pixels maybe in the left
-    // and/or bottom-left superblocks. But only the left superblock is
-    // available, so check if all required pixels fall in that superblock.
-    if (blk_col_in_sb == 0) {
-      const int blk_start_row_off =
-          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
+    if (bl_mask_col < 0) {
+      const int plane_sb_height =
+          block_size_high[cm->seq_params.sb_size] >> ss_y;
+      const int plane_bottom_row =
+          (((mi_row_aligned & (sb_mi_size - 1)) << MI_SIZE_LOG2) +
+           block_size_high[bsize]) >>
           ss_y;
-      const int row_off_in_sb = blk_start_row_off + row_off;
-      const int sb_height_unit = sb_mi_size >> ss_y;
-      return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+      *px_bottom_left =
+          AOMMIN(plane_sb_height - plane_bottom_row, px_bl_common);
+
+      return *px_bottom_left > 0;
+    } else if (bl_mask_row >= sb_mi_size) {
+      return 0;
+    } else {
+      const int bl_offset = bl_mask_row * xd->is_mi_coded_stride + bl_mask_col;
+      // As long as there is one bottom-left mi available, we determine bl is
+      // available
+#if CONFIG_SDP
+      int has_bl = xd->is_mi_coded[av1_get_sdp_idx(xd->tree_type)][bl_offset];
+#else
+      int has_bl = xd->is_mi_coded[bl_offset];
+#endif  // CONFIG_SDP
+
+      // Calculate px_bottom_left: how many bottom-left pixels are available. If
+      // it is less than tx_size_high[txsz], px_bottom_left will be used to
+      // determine the location of the last available pixel, which will be used
+      // for padding.
+      if (has_bl) {
+        int mi_bl = 0;
+        for (int i = 0; i < bottom_left_count_unit << ss_y; ++i) {
+          if ((bl_mask_row + i) >= sb_mi_size ||
+#if CONFIG_SDP
+              !xd->is_mi_coded[av1_get_sdp_idx(xd->tree_type)]
+                              [bl_offset + i * xd->is_mi_coded_stride]
+#else
+              !xd->is_mi_coded[bl_offset + i * xd->is_mi_coded_stride]
+#endif  // CONFIG_SDP
+          ) {
+            break;
+          } else {
+            mi_bl++;
+          }
+        }
+
+        *px_bottom_left = AOMMIN((mi_bl << MI_SIZE_LOG2) >> ss_y, px_bl_common);
+      }
+
+      return has_bl;
     }
-
-    // Bottom row of superblock (and not the leftmost column): so bottom-left
-    // pixels fall in the bottom superblock, which is not available yet.
-    if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
-
-    // General case (neither leftmost column nor bottom row): check if the
-    // bottom-left block is coded before the current block.
-    const int this_blk_index =
-        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
-        blk_col_in_sb + 0;
-    const int idx1 = this_blk_index / 8;
-    const int idx2 = this_blk_index % 8;
-    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
-    return (has_bl_table[idx1] >> idx2) & 1;
   }
 }
 
@@ -1838,56 +1842,8 @@
 #endif
 }
 
-static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
-                                            int subsampling_y) {
-  assert(subsampling_x >= 0 && subsampling_x < 2);
-  assert(subsampling_y >= 0 && subsampling_y < 2);
-  BLOCK_SIZE bs = bsize;
-  switch (bsize) {
-    case BLOCK_4X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X8;
-      break;
-    case BLOCK_4X8:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X8;
-      break;
-    case BLOCK_8X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_8X8;
-      break;
-    case BLOCK_4X16:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X16;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X16;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X16;
-      break;
-    case BLOCK_16X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_16X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_16X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_16X8;
-      break;
-    default: break;
-  }
-  return bs;
-}
+#define ARITHMETIC_LEFT_SHIFT(x, shift) \
+  (((x) >= 0) ? ((x) << (shift)) : (-((-(x)) << (shift))))
 
 void av1_predict_intra_block(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
@@ -1940,35 +1896,51 @@
       col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+#if CONFIG_SDP
+  BLOCK_SIZE bsize = mbmi->sb_type[plane > 0];
+#else   // CONFIG_SDP
+  BLOCK_SIZE bsize = mbmi->sb_type;
+#endif  // CONFIG_SDP
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
 
   // Distance between the right edge of this prediction block to
-  // the frame right edge
-  const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
+  // the tile right edge
+  const int xr =
+      ARITHMETIC_LEFT_SHIFT(xd->tile.mi_col_end - mi_col - mi_wide, 2 - ss_x) +
+      wpx - x - txwpx;
   // Distance between the bottom edge of this prediction block to
-  // the frame bottom edge
-  const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
+  // the tile bottom edge
+  const int yd =
+      ARITHMETIC_LEFT_SHIFT(xd->tile.mi_row_end - mi_row - mi_high, 2 - ss_y) +
+      hpx - y - txhpx;
   const int right_available =
       mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
   const int bottom_available =
       (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
 
-  const PARTITION_TYPE partition = mbmi->partition;
-#if CONFIG_SDP
-  BLOCK_SIZE bsize = mbmi->sb_type[plane > 0];
-#else
-  BLOCK_SIZE bsize = mbmi->sb_type;
-#endif
+  const BLOCK_SIZE init_bsize = bsize;
   // force 4x4 chroma component block size.
   if (ss_x || ss_y) {
-    bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+    bsize = mbmi->chroma_ref_info.bsize_base;
   }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int px_top_right = 0;
+  const int have_top_right = has_top_right(
+      cm, xd, bsize, mi_row, mi_col, have_top, right_available, tx_size,
+      row_off, col_off, ss_x, ss_y, xr, &px_top_right, bsize != init_bsize);
+#else
+  const PARTITION_TYPE partition = mbmi->partition;
   const int have_top_right =
       has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
                     partition, tx_size, row_off, col_off, ss_x, ss_y);
-  const int have_bottom_left =
-      has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
-                      partition, tx_size, row_off, col_off, ss_x, ss_y);
+#endif
+
+  int px_bottom_left = 0;
+  const int have_bottom_left = has_bottom_left(
+      cm, xd, bsize, mi_row, mi_col, bottom_available, have_left, tx_size,
+      row_off, col_off, ss_x, ss_y, yd, &px_bottom_left, bsize != init_bsize);
 
   const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
 
@@ -1982,9 +1954,13 @@
         xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
         filter_intra_mode, tx_size, disable_edge_filter,
         have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+#if CONFIG_EXT_RECUR_PARTITIONS
+        have_top_right ? px_top_right : 0,
+#else
         have_top_right ? AOMMIN(txwpx, xr) : 0,
+#endif
         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane
+        have_bottom_left ? px_bottom_left : 0, plane
 #if CONFIG_MRLS
         ,
         is_sb_boundary
@@ -2001,9 +1977,13 @@
       xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
       filter_intra_mode, tx_size, disable_edge_filter,
       have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+#if CONFIG_EXT_RECUR_PARTITIONS
+      have_top_right ? px_top_right : 0,
+#else
       have_top_right ? AOMMIN(txwpx, xr) : 0,
+#endif
       have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-      have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane
+      have_bottom_left ? px_bottom_left : 0, plane
 #if CONFIG_MRLS
       ,
       is_sb_boundary
@@ -2015,6 +1995,8 @@
   );
 }
 
+#undef ARITHMETIC_LEFT_SHIFT
+
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int plane, int blk_col, int blk_row,
                                     TX_SIZE tx_size) {
@@ -2062,13 +2044,18 @@
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
 #if CONFIG_DEBUG
     assert(is_cfl_allowed(xd));
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+        xd, mbmi, plane, pd->subsampling_x, pd->subsampling_y);
+#else
     const BLOCK_SIZE plane_bsize = get_plane_block_size(
 #if CONFIG_SDP
         mbmi->sb_type[xd->tree_type == CHROMA_PART], pd->subsampling_x,
         pd->subsampling_y);
 #else
-        mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
-#endif
+        mbmi->chroma_ref_info.bsize_base, pd->subsampling_x, pd->subsampling_y);
+#endif  // CONFIG_SDP
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
     (void)plane_bsize;
     assert(plane_bsize < BLOCK_SIZES_ALL);
     if (!xd->lossless[mbmi->segment_id]) {
@@ -2102,8 +2089,7 @@
     if (xd->tree_type == CHROMA_PART) {
       const int luma_tx_size =
           av1_get_max_uv_txsize(mbmi->sb_type[PLANE_TYPE_UV], 0, 0);
-      cfl_store_tx(xd, blk_row, blk_col, luma_tx_size,
-                   mbmi->sb_type[PLANE_TYPE_UV]);
+      cfl_store_tx(xd, blk_row, blk_col, luma_tx_size);
     }
 #endif
     cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index a0e6e9a..0ccac73 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -68,6 +68,7 @@
   return mode >= V_PRED && mode <= D67_PRED;
 }
 
+// TODO(any): Verify the correct behavior when we have BLOCK_4X16
 static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
   return bsize >= BLOCK_8X8;
 }
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index f3c8795..88615c4 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -287,8 +287,8 @@
              mi_col += MAX_MIB_SIZE) {
           c = mi_col >> MAX_MIB_SIZE_LOG2;
 
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
+          av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
 
           av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
                                       mi_col);
@@ -307,8 +307,8 @@
           // completed
           sync_read(lf_sync, r + 1, c, plane);
 
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
+          av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
                                       mi_col);
         }
@@ -354,8 +354,8 @@
              mi_col += MI_SIZE_64X64) {
           c = mi_col >> MIN_MIB_SIZE_LOG2;
 
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+          av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
 
           av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row,
                                               mi_col);
@@ -374,8 +374,8 @@
           // completed
           sync_read(lf_sync, r + 1, c, plane);
 
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+          av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col, plane,
+                               plane + 1, NULL);
           av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
                                               mi_col);
         }
@@ -510,8 +510,7 @@
 
       // TODO(chengchen): can we remove this?
       struct macroblockd_plane *pd = xd->plane;
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane,
-                           plane + 1);
+      av1_setup_dst_planes(pd, frame, 0, 0, plane, plane + 1, NULL);
 
       av1_build_bitmask_vert_info(cm, &pd[plane], plane);
       av1_build_bitmask_horz_info(cm, &pd[plane], plane);
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index a3a8857..e7cbb48 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -280,7 +280,7 @@
           const __m128i res_8 =
               _mm_packus_epi16(round_result_lo, round_result_hi);
 
-          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+          _mm_storeu_si128((__m128i *)(&dst0[j]), res_8);
         } else {
           _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
           _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c
index 185bdd6..122bf73 100644
--- a/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -90,7 +90,7 @@
               _mm_packus_epi32(round_result_lo, round_result_hi);
           const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
 
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
         } else {
           const __m128i res_unsigned_16b =
               _mm_adds_epu16(res, offset_const_16b);
@@ -402,7 +402,7 @@
                 _mm_packus_epi32(round_result_lo, round_result_hi);
             const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
           } else {
             const __m128i res_16b =
                 _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 7c32b49..876b9e6 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -1303,14 +1303,14 @@
 
   round_shift_8x8(in, shift);
 
-  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
-  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
-  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
-  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
-  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
-  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
-  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
-  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+  v0 = _mm_loadu_si128((__m128i const *)(output + 0 * stride));
+  v1 = _mm_loadu_si128((__m128i const *)(output + 1 * stride));
+  v2 = _mm_loadu_si128((__m128i const *)(output + 2 * stride));
+  v3 = _mm_loadu_si128((__m128i const *)(output + 3 * stride));
+  v4 = _mm_loadu_si128((__m128i const *)(output + 4 * stride));
+  v5 = _mm_loadu_si128((__m128i const *)(output + 5 * stride));
+  v6 = _mm_loadu_si128((__m128i const *)(output + 6 * stride));
+  v7 = _mm_loadu_si128((__m128i const *)(output + 7 * stride));
 
   if (flipud) {
     u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
@@ -1332,14 +1332,14 @@
     u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
   }
 
-  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
-  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
-  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
-  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
-  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
-  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
-  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
-  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+  _mm_storeu_si128((__m128i *)(output + 0 * stride), u0);
+  _mm_storeu_si128((__m128i *)(output + 1 * stride), u1);
+  _mm_storeu_si128((__m128i *)(output + 2 * stride), u2);
+  _mm_storeu_si128((__m128i *)(output + 3 * stride), u3);
+  _mm_storeu_si128((__m128i *)(output + 4 * stride), u4);
+  _mm_storeu_si128((__m128i *)(output + 5 * stride), u5);
+  _mm_storeu_si128((__m128i *)(output + 6 * stride), u6);
+  _mm_storeu_si128((__m128i *)(output + 7 * stride), u7);
 }
 
 void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index fa27571..0494c0a 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -93,7 +93,8 @@
               _mm256_packus_epi32(round_result_lo, round_result_hi);
           const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
 
-          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          _mm256_storeu_si256((__m256i *)(&dst0[i * dst_stride0 + j]),
+                              res_clip);
         } else {
           const __m256i res_unsigned_16b =
               _mm256_adds_epu16(res, offset_const_16b);
@@ -200,8 +201,8 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_clip);
             const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storeu_si128(
                 (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
           } else {
             const __m256i res_unsigned_16b =
@@ -209,9 +210,9 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
             const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
 
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
           }
         }
       }
@@ -425,8 +426,8 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_clip);
             const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storeu_si128(
                 (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
           } else {
             __m256i res_16b =
@@ -434,9 +435,9 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_16b);
             const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
 
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
           }
         }
 
@@ -598,18 +599,18 @@
           const __m128i res_0 = _mm256_castsi256_si128(res_clip);
           const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
 
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
-                          res_1);
+          _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+          _mm_storeu_si128(
+              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
         } else {
           __m256i res_16b =
               _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
           const __m128i res_0 = _mm256_castsi256_si128(res_16b);
           const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
 
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_1);
+          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                           res_1);
         }
       }
     }
@@ -809,8 +810,8 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_clip);
             const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storeu_si128(
                 (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
           } else {
             __m256i res_16b =
@@ -818,9 +819,9 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_16b);
             const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
 
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
           }
         }
         s[0] = s[1];
diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c
index 00ff22f..7c0f90f 100644
--- a/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -217,9 +217,9 @@
             const __m128i res_clip_1 =
                 _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
-                            res_clip_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+                             res_clip_0);
+            _mm_storeu_si128(
                 (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
                 res_clip_1);
           } else {
@@ -227,9 +227,9 @@
                 _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
             __m128i res_16bit1 =
                 _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_16bit1);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_16bit1);
           }
         }
         s[0] = s[1];
@@ -366,10 +366,10 @@
           const __m128i res_16b =
               _mm_packus_epi32(round_result_lo, round_result_hi);
           const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
         } else {
           __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
         }
       }
     }
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index c42ebc1..586b3e4 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -369,8 +369,8 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_8);
             const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storeu_si128(
                 (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
 
           } else {
@@ -548,8 +548,8 @@
             const __m128i res_0 = _mm256_castsi256_si128(res_8);
             const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
 
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
+            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storeu_si128(
                 (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
 
           } else {
@@ -840,8 +840,8 @@
           const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
           const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
 
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
-                          _mm256_castsi256_si128(res_0));
+          _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+                           _mm256_castsi256_si128(res_0));
         } else {
           _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
                              res_unsigned);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 5e25082..b43e0a9 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <stddef.h>
 
+#include "av1/common/blockd.h"
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
@@ -253,11 +254,7 @@
 #else
   if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
 #endif
-#if CONFIG_SDP
-    cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type[AOM_PLANE_Y]);
-#else
-    cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
-#endif
+    cfl_store_tx(xd, row, col, tx_size);
   }
 }
 
@@ -307,7 +304,14 @@
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+#elif CONFIG_SDP
   if (xd->tree_type == SHARED_PART)
     assert(mbmi->sb_type[PLANE_TYPE_Y] == mbmi->sb_type[PLANE_TYPE_UV]);
   const TX_SIZE plane_tx_size =
@@ -317,8 +321,8 @@
                                                          blk_col)];
 #else
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
-                                    pd->subsampling_y)
+      plane ? av1_get_max_uv_txsize(mbmi->chroma_ref_info.bsize_base,
+                                    pd->subsampling_x, pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 #endif
@@ -393,7 +397,8 @@
 
 static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   int bw, int bh, int x_mis, int y_mis) {
+                                   int bw, int bh, int x_mis, int y_mis,
+                                   PARTITION_TREE *parent, int index) {
   const int num_planes = av1_num_planes(cm);
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const TileInfo *const tile = &xd->tile;
@@ -423,25 +428,31 @@
   }
 #endif
 
-  set_plane_n4(xd, bw, bh, num_planes);
-  set_entropy_context(xd, mi_row, mi_col, num_planes);
+  CHROMA_REF_INFO *chr_ref_info = &xd->mi[0]->chroma_ref_info;
+  set_chroma_ref_info(mi_row, mi_col, index, bsize, chr_ref_info,
+                      parent ? &parent->chroma_ref_info : NULL,
+                      parent ? parent->bsize : BLOCK_INVALID,
+                      parent ? parent->partition : PARTITION_NONE,
+                      xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+  set_plane_n4(xd, bw, bh, num_planes, chr_ref_info);
+  set_entropy_context(xd, mi_row, mi_col, num_planes, chr_ref_info);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
-                 mi_params->mi_cols);
+                 mi_params->mi_cols, chr_ref_info);
 
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, chr_ref_info);
 }
 
 static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
                                          DecoderCodingBlock *dcb, int mi_row,
                                          int mi_col, aom_reader *r,
                                          PARTITION_TYPE partition,
-                                         BLOCK_SIZE bsize) {
+                                         BLOCK_SIZE bsize,
+                                         PARTITION_TREE *parent, int index) {
   AV1_COMMON *const cm = &pbi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
@@ -451,18 +462,27 @@
 #if CONFIG_ACCOUNTING
   aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
 #endif
-  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, parent,
+              index);
   xd->mi[0]->partition = partition;
   av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
-  if (bsize >= BLOCK_8X8 &&
-      (seq_params->subsampling_x || seq_params->subsampling_y)) {
-    const BLOCK_SIZE uv_subsize =
-        ss_size_lookup[bsize][seq_params->subsampling_x]
-                      [seq_params->subsampling_y];
-    if (uv_subsize == BLOCK_INVALID)
+
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  if (xd->tree_type != LUMA_PART) {
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    const struct macroblockd_plane *const pd_u = &xd->plane[1];
+    const BLOCK_SIZE chroma_bsize_base = xd->mi[0]->chroma_ref_info.bsize_base;
+    assert(chroma_bsize_base < BLOCK_SIZES_ALL);
+    if (get_plane_block_size(chroma_bsize_base, pd_u->subsampling_x,
+                             pd_u->subsampling_y) == BLOCK_INVALID) {
       aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
-                         "Invalid block size.");
+                         "Block size %dx%d invalid with this subsampling mode",
+                         block_size_wide[chroma_bsize_base],
+                         block_size_high[chroma_bsize_base]);
+    }
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
   }
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
 }
 
 typedef struct PadBlock {
@@ -903,13 +923,8 @@
                                      dst_stride2);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-#if CONFIG_SDP
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type[PLANE_TYPE_Y],
-                       &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes);
-#else
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
-#endif
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, &xd->mi[0]->chroma_ref_info);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
@@ -955,7 +970,8 @@
 
       xd->block_ref_scale_factors[ref] = ref_scale_factors;
       av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
-                           ref_scale_factors, num_planes);
+                           ref_scale_factors, num_planes,
+                           &mbmi->chroma_ref_info);
     }
   }
 
@@ -969,9 +985,7 @@
     int pixel_c, pixel_r;
     mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
                     pd->subsampling_y);
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
+    if (plane && !xd->is_chroma_ref) continue;
     mismatch_check_block_pre(pd->dst.buf, pd->dst.stride,
                              cm->current_frame.order_hint, plane, pixel_c,
                              pixel_r, pd->width, pd->height,
@@ -1094,8 +1108,16 @@
             const struct macroblockd_plane *const pd = &xd->plane[plane];
             const int ss_x = pd->subsampling_x;
             const int ss_y = pd->subsampling_y;
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+            const BLOCK_SIZE plane_bsize =
+                get_mb_plane_block_size(xd, mbmi, plane, ss_x, ss_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+            assert(plane_bsize == get_plane_block_size(bsize, ss_x, ss_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#else
             const BLOCK_SIZE plane_bsize =
                 get_plane_block_size(bsize, ss_x, ss_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
             const TX_SIZE max_tx_size =
                 get_vartx_max_txsize(xd, plane_bsize, plane);
             const int bh_var_tx = tx_size_high_unit[max_tx_size];
@@ -1127,6 +1149,7 @@
   }
 
   av1_visit_palette(pbi, xd, r, set_color_index_map_offset);
+  av1_mark_block_as_coded(xd, bsize, cm->seq_params.sb_size);
 }
 
 static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
@@ -1411,10 +1434,12 @@
                                           ThreadData *const td, int mi_row,
                                           int mi_col, aom_reader *r,
                                           PARTITION_TYPE partition,
-                                          BLOCK_SIZE bsize) {
+                                          BLOCK_SIZE bsize,
+                                          PARTITION_TREE *parent, int index) {
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
-  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
+  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize, parent,
+                    index);
 
   av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 
@@ -1533,11 +1558,12 @@
     }
   }
 #if CONFIG_SDP
+  assert(bsize == mbmi->sb_type[av1_get_sdp_idx(xd->tree_type)]);
   if (mbmi->skip_txfm[xd->tree_type == CHROMA_PART])
-    av1_reset_entropy_context(xd, bsize, num_planes);
 #else
-  if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
-#endif
+  if (mbmi->skip_txfm)
+#endif  // CONFIG_SDP
+    av1_reset_entropy_context(xd, bsize, num_planes);
 #if CONFIG_SDP
   decode_token_recon_block(pbi, td, r, partition, bsize);
 #else
@@ -1587,10 +1613,9 @@
 #endif
 }
 
-static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
-                                                      ThreadData *const td,
-                                                      int mi_row, int mi_col,
-                                                      BLOCK_SIZE bsize) {
+static AOM_INLINE void set_offsets_for_pred_and_recon(
+    AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, PARTITION_TREE *parent, int index) {
   AV1_COMMON *const cm = &pbi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   DecoderCodingBlock *const dcb = &td->dcb;
@@ -1607,23 +1632,29 @@
       &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col];
   xd->tx_type_map_stride = mi_params->mi_stride;
 
-  set_plane_n4(xd, bw, bh, num_planes);
+  CHROMA_REF_INFO *chr_ref_info = &xd->mi[0]->chroma_ref_info;
+  set_chroma_ref_info(mi_row, mi_col, index, bsize, chr_ref_info,
+                      parent ? &parent->chroma_ref_info : NULL,
+                      parent ? parent->bsize : BLOCK_INVALID,
+                      parent ? parent->partition : PARTITION_NONE,
+                      xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+  set_plane_n4(xd, bw, bh, num_planes, chr_ref_info);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
-                 mi_params->mi_cols);
+                 mi_params->mi_cols, chr_ref_info);
 
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, chr_ref_info);
 }
 
 static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td,
                                     int mi_row, int mi_col, aom_reader *r,
-                                    PARTITION_TYPE partition,
-                                    BLOCK_SIZE bsize) {
+                                    PARTITION_TYPE partition, BLOCK_SIZE bsize,
+                                    PARTITION_TREE *parent, int index) {
   (void)partition;
-  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize, parent, index);
 #if CONFIG_SDP
   decode_token_recon_block(pbi, td, r, partition, bsize);
 #else
@@ -1635,15 +1666,65 @@
 static PARTITION_TYPE read_partition(const AV1_COMMON *const cm,
                                      MACROBLOCKD *xd, int mi_row, int mi_col,
                                      aom_reader *r, int has_rows, int has_cols,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                                     PARTITION_TREE *ptree_luma,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
                                      BLOCK_SIZE bsize) {
 #else
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      aom_reader *r, int has_rows, int has_cols,
                                      BLOCK_SIZE bsize) {
 #endif
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (!is_partition_point(bsize)) return PARTITION_NONE;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP
+  const int plane = xd->tree_type == CHROMA_PART;
+  if (plane == 1 && bsize == BLOCK_8X8) {
+    return PARTITION_NONE;
+  }
+  const int min_bsize_1d =
+      AOMMIN(block_size_high[bsize], block_size_wide[bsize]);
+  if (plane && min_bsize_1d >= SHARED_PART_SIZE) {
+    const int ssx = cm->seq_params.subsampling_x;
+    const int ssy = cm->seq_params.subsampling_y;
+    if (ptree_luma)
+      return sdp_chroma_part_from_luma(bsize, ptree_luma->partition, ssx, ssy);
+  }
+#endif  // CONFIG_SDP
+
+  if (is_square_block(bsize)) {
+    if (!has_rows && has_cols) return PARTITION_HORZ;
+    if (has_rows && !has_cols) return PARTITION_VERT;
+
+    assert(ctx >= 0);
+    if (has_rows && has_cols) {
+#if CONFIG_SDP
+      aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[plane][ctx];
+#else
+      aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
+#endif  // CONFIG_SDP
+
+      return (PARTITION_TYPE)aom_read_symbol(
+          r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
+    } else {  // !has_rows && !has_cols
+      aom_cdf_prob cdf[2] = { 16384, AOM_ICDF(CDF_PROB_TOP) };
+      return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_VERT
+                                               : PARTITION_HORZ;
+    }
+  } else {
+    aom_cdf_prob *partition_rec_cdf = ec_ctx->partition_rec_cdf[ctx];
+    const PARTITION_TYPE_REC symbol = (PARTITION_TYPE_REC)aom_read_symbol(
+        r, partition_rec_cdf, partition_rec_cdf_length(bsize), ACCT_STR);
+
+    return get_partition_from_symbol_rec_block(bsize, symbol);
+  }
+#else  // !CONFIG_EXT_RECUR_PARTITIONS
   if (!has_rows && !has_cols) return PARTITION_SPLIT;
 
 #if CONFIG_SDP
@@ -1653,12 +1734,12 @@
   }
   int parent_block_width = block_size_wide[bsize];
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  if (xd->tree_type == CHROMA_PART && parent_block_width >= SHARED_PART_SIZE) {
+  if (plane && parent_block_width >= SHARED_PART_SIZE) {
     int luma_split_flag = get_luma_split_flag(bsize, mi_params, mi_row, mi_col);
     // if luma blocks uses smaller blocks, then chroma will also split
     if (luma_split_flag > 3) return PARTITION_SPLIT;
   }
-#endif
+#endif  // CONFIG_SDP
 
   assert(ctx >= 0);
 #if CONFIG_SDP
@@ -1683,26 +1764,31 @@
     assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
     return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
   }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
                                         ThreadData *const td, int mi_row,
                                         int mi_col, aom_reader *reader,
-                                        BLOCK_SIZE bsize,
+                                        BLOCK_SIZE bsize, PARTITION_TREE *ptree,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                                        PARTITION_TREE *ptree_luma,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
                                         int parse_decode_flag) {
   assert(bsize < BLOCK_SIZES_ALL);
   AV1_COMMON *const cm = &pbi->common;
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
-  const int bw = mi_size_wide[bsize];
-  const int hbs = bw >> 1;
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+  const int hbs_w = mi_size_wide[bsize] / 2;
+  const int hbs_h = mi_size_high[bsize] / 2;
+  const int qbs_w = mi_size_wide[bsize] / 4;
+  const int qbs_h = mi_size_high[bsize] / 4;
   PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  const int quarter_step = bw / 4;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
+  const int has_rows = (mi_row + hbs_h) < cm->mi_params.mi_rows;
+  const int has_cols = (mi_col + hbs_w) < cm->mi_params.mi_cols;
 
   if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
     return;
@@ -1737,24 +1823,63 @@
       }
     }
 #if CONFIG_SDP
-    partition = (bsize < BLOCK_8X8)
-                    ? PARTITION_NONE
-                    : read_partition(cm, xd, mi_row, mi_col, reader, has_rows,
-                                     has_cols, bsize);
-#else
-    partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
-                                    : read_partition(xd, mi_row, mi_col, reader,
-                                                     has_rows, has_cols, bsize);
-#endif
-  } else {
-#if CONFIG_SDP
     partition =
-        get_partition(cm, xd->tree_type == CHROMA_PART, mi_row, mi_col, bsize);
+        !is_partition_point(bsize)
+            ? PARTITION_NONE
+            : read_partition(cm, xd, mi_row, mi_col, reader, has_rows, has_cols,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                             ptree_luma,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                             bsize);
 #else
-    partition = get_partition(cm, mi_row, mi_col, bsize);
+    partition = !is_partition_point(bsize)
+                    ? PARTITION_NONE
+                    : read_partition(xd, mi_row, mi_col, reader, has_rows,
+                                     has_cols, bsize);
 #endif
+
+    ptree->partition = partition;
+    ptree->bsize = bsize;
+    ptree->mi_row = mi_row;
+    ptree->mi_col = mi_col;
+    ptree->is_settled = 1;
+    PARTITION_TREE *parent = ptree->parent;
+    set_chroma_ref_info(
+        mi_row, mi_col, ptree->index, bsize, &ptree->chroma_ref_info,
+        parent ? &parent->chroma_ref_info : NULL,
+        parent ? parent->bsize : BLOCK_INVALID,
+        parent ? parent->partition : PARTITION_NONE, ss_x, ss_y);
+
+    switch (partition) {
+      case PARTITION_SPLIT:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        ptree->sub_tree[2] = av1_alloc_ptree_node(ptree, 2);
+        ptree->sub_tree[3] = av1_alloc_ptree_node(ptree, 3);
+        break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        break;
+      case PARTITION_HORZ_3:
+      case PARTITION_VERT_3:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        ptree->sub_tree[2] = av1_alloc_ptree_node(ptree, 2);
+        break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      default: break;
+    }
+  } else {
+    partition = ptree->partition;
   }
-  subsize = get_partition_subsize(bsize, partition);
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+  const int track_ptree_luma =
+      ptree_luma ? (partition == ptree_luma->partition) : 0;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   if (subsize == BLOCK_INVALID) {
     aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Partition is invalid for block size %dx%d",
@@ -1772,63 +1897,115 @@
 
 #define DEC_BLOCK_STX_ARG
 #define DEC_BLOCK_EPT_ARG partition,
-#define DEC_BLOCK(db_r, db_c, db_subsize)                                  \
-  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
-                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
-#define DEC_PARTITION(db_r, db_c, db_subsize)                        \
+#define DEC_BLOCK(db_r, db_c, db_subsize, index)                               \
+  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c),     \
+                                 reader, DEC_BLOCK_EPT_ARG(db_subsize), ptree, \
+                                 index)
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+#define DEC_PARTITION(db_r, db_c, db_subsize, index)                      \
+  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader,      \
+                   (db_subsize), ptree->sub_tree[(index)],                \
+                   track_ptree_luma ? ptree_luma->sub_tree[index] : NULL, \
+                   parse_decode_flag)
+#else  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+#define DEC_PARTITION(db_r, db_c, db_subsize, index)                 \
   decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
-                   (db_subsize), parse_decode_flag)
+                   (db_subsize), ptree->sub_tree[(index)], parse_decode_flag)
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   switch (partition) {
-    case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
+    case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize, 0); break;
     case PARTITION_HORZ:
-      DEC_BLOCK(mi_row, mi_col, subsize);
-      if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+#if CONFIG_EXT_RECUR_PARTITIONS
+      DEC_PARTITION(mi_row, mi_col, subsize, 0);
+      if ((mi_row + hbs_h) < cm->mi_params.mi_rows)
+        DEC_PARTITION(mi_row + hbs_h, mi_col, subsize, 1);
+#else
+      DEC_BLOCK(mi_row, mi_col, subsize, 0);
+      if (has_rows) DEC_BLOCK(mi_row + hbs_h, mi_col, subsize, 1);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
     case PARTITION_VERT:
-      DEC_BLOCK(mi_row, mi_col, subsize);
-      if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+#if CONFIG_EXT_RECUR_PARTITIONS
+      DEC_PARTITION(mi_row, mi_col, subsize, 0);
+      if ((mi_col + hbs_w) < cm->mi_params.mi_cols)
+        DEC_PARTITION(mi_row, mi_col + hbs_w, subsize, 1);
+#else
+      DEC_BLOCK(mi_row, mi_col, subsize, 0);
+      if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs_w, subsize, 1);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3: {
+      const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_HORZ);
+      int this_mi_row = mi_row;
+      DEC_PARTITION(this_mi_row, mi_col, subsize, 0);
+      this_mi_row += qbs_h;
+      if (this_mi_row >= cm->mi_params.mi_rows) break;
+      DEC_PARTITION(this_mi_row, mi_col, bsize3, 1);
+      this_mi_row += 2 * qbs_h;
+      if (this_mi_row >= cm->mi_params.mi_rows) break;
+      DEC_PARTITION(this_mi_row, mi_col, subsize, 2);
+      break;
+    }
+    case PARTITION_VERT_3: {
+      const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_VERT);
+      int this_mi_col = mi_col;
+      DEC_PARTITION(mi_row, this_mi_col, subsize, 0);
+      this_mi_col += qbs_w;
+      if (this_mi_col >= cm->mi_params.mi_cols) break;
+      DEC_PARTITION(mi_row, this_mi_col, bsize3, 1);
+      this_mi_col += 2 * qbs_w;
+      if (this_mi_col >= cm->mi_params.mi_cols) break;
+      DEC_PARTITION(mi_row, this_mi_col, subsize, 2);
+      break;
+    }
+#else
     case PARTITION_SPLIT:
-      DEC_PARTITION(mi_row, mi_col, subsize);
-      DEC_PARTITION(mi_row, mi_col + hbs, subsize);
-      DEC_PARTITION(mi_row + hbs, mi_col, subsize);
-      DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
+      DEC_PARTITION(mi_row, mi_col, subsize, 0);
+      DEC_PARTITION(mi_row, mi_col + hbs_w, subsize, 1);
+      DEC_PARTITION(mi_row + hbs_h, mi_col, subsize, 2);
+      DEC_PARTITION(mi_row + hbs_h, mi_col + hbs_w, subsize, 3);
       break;
     case PARTITION_HORZ_A:
-      DEC_BLOCK(mi_row, mi_col, bsize2);
-      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
-      DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+      DEC_BLOCK(mi_row, mi_col, bsize2, 0);
+      DEC_BLOCK(mi_row, mi_col + hbs_w, bsize2, 1);
+      DEC_BLOCK(mi_row + hbs_h, mi_col, subsize, 2);
       break;
     case PARTITION_HORZ_B:
-      DEC_BLOCK(mi_row, mi_col, subsize);
-      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
-      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+      DEC_BLOCK(mi_row, mi_col, subsize, 0);
+      DEC_BLOCK(mi_row + hbs_h, mi_col, bsize2, 1);
+      DEC_BLOCK(mi_row + hbs_h, mi_col + hbs_w, bsize2, 2);
       break;
     case PARTITION_VERT_A:
-      DEC_BLOCK(mi_row, mi_col, bsize2);
-      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
-      DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+      DEC_BLOCK(mi_row, mi_col, bsize2, 0);
+      DEC_BLOCK(mi_row + hbs_h, mi_col, bsize2, 1);
+      DEC_BLOCK(mi_row, mi_col + hbs_w, subsize, 2);
       break;
     case PARTITION_VERT_B:
-      DEC_BLOCK(mi_row, mi_col, subsize);
-      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
-      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+      DEC_BLOCK(mi_row, mi_col, subsize, 0);
+      DEC_BLOCK(mi_row, mi_col + hbs_w, bsize2, 1);
+      DEC_BLOCK(mi_row + hbs_h, mi_col + hbs_w, bsize2, 2);
       break;
     case PARTITION_HORZ_4:
       for (int i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
+        int this_mi_row = mi_row + i * qbs_h;
         if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;
-        DEC_BLOCK(this_mi_row, mi_col, subsize);
+        DEC_BLOCK(this_mi_row, mi_col, subsize, i);
       }
       break;
     case PARTITION_VERT_4:
       for (int i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
+        int this_mi_col = mi_col + i * qbs_w;
         if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;
-        DEC_BLOCK(mi_row, this_mi_col, subsize);
+        DEC_BLOCK(mi_row, this_mi_col, subsize, i);
       }
       break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     default: assert(0 && "Invalid partition type");
   }
 
@@ -1875,10 +2052,26 @@
           ? 2
           : 1;
   xd->tree_type = (total_loop_num == 1 ? SHARED_PART : LUMA_PART);
-  decode_partition(pbi, td, mi_row, mi_col, reader, bsize, parse_decode_flag);
+  if (parse_decode_flag & 1) {
+    av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
+  }
+  decode_partition(pbi, td, mi_row, mi_col, reader, bsize,
+                   td->dcb.xd.sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                   NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                   parse_decode_flag);
   if (total_loop_num == 2) {
     xd->tree_type = CHROMA_PART;
-    decode_partition(pbi, td, mi_row, mi_col, reader, bsize, parse_decode_flag);
+    if (parse_decode_flag & 1) {
+      av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
+    }
+    decode_partition(pbi, td, mi_row, mi_col, reader, bsize,
+                     td->dcb.xd.sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                     td->dcb.xd.sbi->ptree_root[0],
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                     parse_decode_flag);
     xd->tree_type = SHARED_PART;
   }
 }
@@ -3099,6 +3292,8 @@
 
   for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
        mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
+    av1_reset_is_mi_coded_map(&td->dcb.xd, cm->seq_params.mib_size);
+    td->dcb.xd.sbi = av1_get_sb_info(cm, mi_row, mi_col);
     set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
                   mi_col);
 
@@ -3110,7 +3305,7 @@
                         cm->seq_params.sb_size, 0x2);
 #else
     decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                     cm->seq_params.sb_size, 0x2);
+                     cm->seq_params.sb_size, td->dcb.xd.sbi->ptree_root, 0x2);
 #endif
 
     sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
@@ -3186,15 +3381,19 @@
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
          mi_col += cm->seq_params.mib_size) {
+      av1_reset_is_mi_coded_map(xd, cm->seq_params.mib_size);
+      av1_set_sb_info(cm, xd, mi_row, mi_col);
       set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 #if CONFIG_SDP
       decode_partition_sb(pbi, td, mi_row, mi_col, td->bit_reader,
                           cm->seq_params.sb_size, 0x3);
 #else
+      av1_reset_ptree_in_sbi(xd->sbi);
       // Bit-stream parsing and decoding of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x3);
+                       cm->seq_params.sb_size, xd->sbi->ptree_root, 0x3);
 #endif
+
       if (aom_reader_has_overflowed(td->bit_reader)) {
         aom_merge_corrupted_flag(&dcb->corrupted, 1);
         return;
@@ -3626,6 +3825,8 @@
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
          mi_col += cm->seq_params.mib_size) {
+      av1_reset_is_mi_coded_map(xd, cm->seq_params.mib_size);
+      av1_set_sb_info(cm, xd, mi_row, mi_col);
       set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 
       // Bit-stream parsing of the superblock
@@ -3633,8 +3834,9 @@
       decode_partition_sb(pbi, td, mi_row, mi_col, td->bit_reader,
                           cm->seq_params.sb_size, 0x1);
 #else
+      av1_reset_ptree_in_sbi(xd->sbi);
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x1);
+                       cm->seq_params.sb_size, xd->sbi->ptree_root, 0x1);
 #endif
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
@@ -5805,8 +6007,8 @@
         (cm->ccso_info.ccso_enable[0] || cm->ccso_info.ccso_enable[1]);
     uint16_t *ext_rec_y;
     if (use_ccso) {
-      av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size,
-                           &cm->cur_frame->buf, 0, 0, 0, num_planes);
+      av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, 0, 0, 0, num_planes,
+                           NULL);
       const int ccso_stride_ext =
           xd->plane[0].dst.width + (CCSO_PADDING_SIZE << 1);
       ext_rec_y =
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 9509e18..3f11d5d 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -38,10 +38,11 @@
 
 static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
 #if CONFIG_SDP
-  const int skip_txfm = xd->mi[0]->skip_txfm[xd->tree_type == CHROMA_PART];
+  assert(xd->tree_type != CHROMA_PART);
+  const int skip_txfm = xd->mi[0]->skip_txfm[0];
 #else
   const int skip_txfm = xd->mi[0]->skip_txfm;
-#endif
+#endif  // CONFIG_SDP
   if (cm->features.coded_lossless) return;
   if (cm->features.allow_intrabc) {
     assert(cm->cdef_info.cdef_bits == 0);
@@ -68,6 +69,27 @@
   const int index = (cm->seq_params.sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int second_index = index;
+  const int current_grid_idx =
+      get_mi_grid_idx(&cm->mi_params, xd->mi_row, xd->mi_col);
+  const MB_MODE_INFO *const current_mbmi =
+      cm->mi_params.mi_grid_base[current_grid_idx];
+#if CONFIG_SDP
+  const BLOCK_SIZE current_bsize = current_mbmi->sb_type[0];
+#else
+  const BLOCK_SIZE current_bsize = current_mbmi->sb_type;
+#endif  // CONFIG_SDP
+  const int mi_row_end = xd->mi_row + mi_size_high[current_bsize] - 1;
+  const int mi_col_end = xd->mi_col + mi_size_wide[current_bsize] - 1;
+  if (cm->seq_params.sb_size == BLOCK_128X128 &&
+      block_size_wide[current_bsize] != 128 &&
+      block_size_high[current_bsize] != 128) {
+    const int second_cdef_unit_row_in_sb = ((mi_row_end & index_mask) != 0);
+    const int second_cdef_unit_col_in_sb = ((mi_col_end & index_mask) != 0);
+    second_index = second_cdef_unit_col_in_sb + 2 * second_cdef_unit_row_in_sb;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   // Read CDEF strength from the first non-skip coding block in this CDEF unit.
   if (!xd->cdef_transmitted[index] && !skip_txfm) {
@@ -83,6 +105,42 @@
         aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
     xd->cdef_transmitted[index] = true;
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (!xd->cdef_transmitted[second_index] && !skip_txfm) {
+    // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
+    // of the 1st block in this CDEF unit.
+    const int first_block_mask = ~(cdef_size - 1);
+    CommonModeInfoParams *const mi_params = &cm->mi_params;
+    const int grid_idx =
+        get_mi_grid_idx(mi_params, mi_row_end & first_block_mask,
+                        mi_col_end & first_block_mask);
+#if CONFIG_SDP
+    assert(IMPLIES(!mi_params->mi_grid_base[grid_idx],
+                   xd->tree_type == LUMA_PART));
+    if (!mi_params->mi_grid_base[grid_idx]) {
+      const int mi_alloc_idx =
+          get_alloc_mi_idx(mi_params, mi_row_end & first_block_mask,
+                           mi_col_end & first_block_mask);
+      mi_params->mi_grid_base[grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
+    }
+#endif
+    MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+    mbmi->cdef_strength =
+        aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
+    xd->cdef_transmitted[second_index] = true;
+#if CONFIG_SDP
+    for (int x = 0; x < mi_size_wide[current_bsize]; x++) {
+      for (int y = 0; y < mi_size_high[current_bsize]; y++) {
+        const int mi_x = xd->mi_col + x;
+        const int mi_y = xd->mi_row + y;
+        const int idx = get_alloc_mi_idx(mi_params, mi_y, mi_x);
+        if (mi_y < mi_params->mi_rows && mi_x < mi_params->mi_cols)
+          mi_params->mi_alloc[idx].cdef_strength = mbmi->cdef_strength;
+      }
+    }
+#endif  // CONFIG_SDP
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 #if CONFIG_CCSO
@@ -1919,12 +1977,14 @@
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
 #if CONFIG_SDP
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y]) &&
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type[PLANE_TYPE_Y], xd->mi_row,
+                                        xd->mi_col) &&
       !mbmi->skip_mode && !has_second_ref(mbmi)) {
 #else
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
-      !has_second_ref(mbmi)) {
-#endif
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type, xd->mi_row,
+                                        xd->mi_col) &&
+      !mbmi->skip_mode && !has_second_ref(mbmi)) {
+#endif  // CONFIG_SDP
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
   }
   av1_count_overlappable_neighbors(cm, xd);
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 45355ad..7a1f9a4 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -417,7 +417,8 @@
 
 typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
                                    int mi_row, int mi_col, aom_reader *r,
-                                   PARTITION_TYPE partition, BLOCK_SIZE bsize);
+                                   PARTITION_TYPE partition, BLOCK_SIZE bsize,
+                                   PARTITION_TREE *parent, int index);
 
 /*!\endcond */
 
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c
index 2f5c152..f95bc3e 100644
--- a/av1/decoder/decodetxb.c
+++ b/av1/decoder/decodetxb.c
@@ -380,14 +380,20 @@
   MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_SDP
-  const BLOCK_SIZE bsize = mbmi->sb_type[plane > 0];
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+  const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+      xd, mbmi, plane, pd->subsampling_x, pd->subsampling_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  assert(plane_bsize == get_plane_block_size(mbmi->sb_type[plane > 0],
+                                             pd->subsampling_x,
+                                             pd->subsampling_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 #else
   const BLOCK_SIZE bsize = mbmi->sb_type;
-#endif
   assert(bsize < BLOCK_SIZES_ALL);
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
 
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 4a45fc7..824b45f 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -161,7 +161,8 @@
     aom_clear_system_state();
     low_var_thresh = DEFAULT_LV_THRESH;
 
-    av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+    av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes,
+                         &mb->e_mbd.mi[0]->chroma_ref_info);
     logvar = av1_log_block_var(cpi, mb, bs);
 
     segment = AQ_C_SEGMENTS - 1;  // Just in case no break out below.
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index d830a00..fc6d529 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -21,6 +21,7 @@
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem_ops.h"
 #include "aom_ports/system_state.h"
+#include "av1/common/blockd.h"
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG
@@ -515,7 +516,14 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_SDP
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+#elif CONFIG_SDP
   const TX_SIZE plane_tx_size =
       plane ? av1_get_max_uv_txsize(mbmi->sb_type[plane > 0], pd->subsampling_x,
                                     pd->subsampling_y)
@@ -523,8 +531,8 @@
                                                          blk_col)];
 #else
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
-                                    pd->subsampling_y)
+      plane ? av1_get_max_uv_txsize(mbmi->chroma_ref_info.bsize_base,
+                                    pd->subsampling_x, pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 #endif
@@ -1210,6 +1218,28 @@
   const int index = (cm->seq_params.sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int second_index = index;
+  const int current_grid_idx =
+      get_mi_grid_idx(&cm->mi_params, xd->mi_row, xd->mi_col);
+  const MB_MODE_INFO *const current_mbmi =
+      cm->mi_params.mi_grid_base[current_grid_idx];
+#if CONFIG_SDP
+  assert(xd->tree_type != CHROMA_PART);
+  const BLOCK_SIZE current_bsize = current_mbmi->sb_type[0];
+#else
+  const BLOCK_SIZE current_bsize = current_mbmi->sb_type;
+#endif  // CONFIG_SDP
+  const int mi_row_end = xd->mi_row + mi_size_high[current_bsize] - 1;
+  const int mi_col_end = xd->mi_col + mi_size_wide[current_bsize] - 1;
+  if (cm->seq_params.sb_size == BLOCK_128X128 &&
+      block_size_wide[current_bsize] != 128 &&
+      block_size_high[current_bsize] != 128) {
+    const int second_cdef_unit_row_in_sb = ((mi_row_end & index_mask) != 0);
+    const int second_cdef_unit_col_in_sb = ((mi_col_end & index_mask) != 0);
+    second_index = second_cdef_unit_col_in_sb + 2 * second_cdef_unit_row_in_sb;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   // Write CDEF strength to the first non-skip coding block in this CDEF unit.
   if (!xd->cdef_transmitted[index] && !skip) {
@@ -1224,6 +1254,20 @@
     aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits);
     xd->cdef_transmitted[index] = true;
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (!xd->cdef_transmitted[second_index] && !skip) {
+    // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO
+    // of the 1st block in this CDEF unit.
+    const int first_block_mask = ~(cdef_size - 1);
+    const CommonModeInfoParams *const mi_params = &cm->mi_params;
+    const int grid_idx =
+        get_mi_grid_idx(mi_params, mi_row_end & first_block_mask,
+                        mi_col_end & first_block_mask);
+    const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+    aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits);
+    xd->cdef_transmitted[second_index] = true;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 #if CONFIG_CCSO
@@ -1903,16 +1947,21 @@
     const int plane) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_SDP
-  const BLOCK_SIZE bsize = mbmi->sb_type[PLANE_TYPE_Y];
-#else
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-#endif
-  assert(bsize < BLOCK_SIZES_ALL);
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+  const BLOCK_SIZE plane_bsize =
+      get_mb_plane_block_size(xd, mbmi, plane, ss_x, ss_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  assert(plane_bsize ==
+         get_plane_block_size(mbmi->sb_type[PLANE_TYPE_Y], ss_x, ss_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   assert(plane_bsize < BLOCK_SIZES_ALL);
+#else
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
   const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
   const int step =
       tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
@@ -1943,7 +1992,9 @@
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-#if CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+  const BLOCK_SIZE bsize = get_bsize_base(xd, mbmi, AOM_PLANE_Y);
+#elif CONFIG_SDP
   const BLOCK_SIZE bsize = mbmi->sb_type[xd->tree_type == CHROMA_PART];
 #else
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -2042,7 +2093,7 @@
   const int bh = mi_size_high[bsize];
   const int bw = mi_size_wide[bsize];
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
-                 mi_params->mi_cols);
+                 mi_params->mi_cols, &mbmi->chroma_ref_info);
 
   xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
   xd->left_txfm_context =
@@ -2138,33 +2189,81 @@
 #endif
     write_tokens_b(cpi, w, tok, tok_end);
   }
+
+  av1_mark_block_as_coded(xd, bsize, cm->seq_params.sb_size);
 }
 
 static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
-                                       const MACROBLOCKD *const xd, int hbs,
-                                       int mi_row, int mi_col, PARTITION_TYPE p,
-                                       BLOCK_SIZE bsize, aom_writer *w) {
-  const int is_partition_point = bsize >= BLOCK_8X8;
-
-  if (!is_partition_point) return;
+                                       const MACROBLOCKD *const xd, int mi_row,
+                                       int mi_col, PARTITION_TYPE p,
+                                       BLOCK_SIZE bsize,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                                       PARTITION_TREE *ptree_luma,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                                       aom_writer *w) {
+  if (!is_partition_point(bsize)) return;
 
 #if CONFIG_SDP
   const int plane = xd->tree_type == CHROMA_PART;
   if (bsize == BLOCK_8X8 && plane > 0) return;
-#endif
+  const int parent_block_width = block_size_wide[bsize];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int min_bsize_1d = AOMMIN(block_size_high[bsize], parent_block_width);
+  if (xd->tree_type == CHROMA_PART && min_bsize_1d >= SHARED_PART_SIZE) {
+    const int ssx = cm->seq_params.subsampling_x;
+    const int ssy = cm->seq_params.subsampling_y;
+    (void)ssx;
+    (void)ssy;
+    if (ptree_luma) {
+      assert(p ==
+             sdp_chroma_part_from_luma(bsize, ptree_luma->partition, ssx, ssy));
+      return;
+    }
+  }
+  (void)ptree_luma;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#endif  // CONFIG_SDP
 
-  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (is_square_block(bsize)) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    const int hbs_w = mi_size_wide[bsize] / 2;
+    const int hbs_h = mi_size_high[bsize] / 2;
+    const int has_rows = (mi_row + hbs_h) < cm->mi_params.mi_rows;
+    const int has_cols = (mi_col + hbs_w) < cm->mi_params.mi_cols;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+    if (has_rows && has_cols) {
+#if CONFIG_SDP
+      aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[plane][ctx];
+#else
+      aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
+#endif  // CONFIG_SDP
+      aom_write_symbol(w, p, partition_cdf, partition_cdf_length(bsize));
+    } else if (!has_rows && has_cols) {
+      assert(p == PARTITION_HORZ);
+    } else if (has_rows && !has_cols) {
+      assert(p == PARTITION_VERT);
+    } else {
+      assert(p == PARTITION_HORZ || p == PARTITION_VERT);
+      aom_cdf_prob cdf[2] = { 16384, AOM_ICDF(CDF_PROB_TOP) };
+      aom_write_cdf(w, p == PARTITION_VERT, cdf, 2);
+    }
+  } else {  // 1:2 or 2:1 rectangular blocks
+    const PARTITION_TYPE_REC symbol =
+        get_symbol_from_partition_rec_block(bsize, p);
+    aom_write_symbol(w, symbol, ec_ctx->partition_rec_cdf[ctx],
+                     partition_rec_cdf_length(bsize));
+  }
+#else  // CONFIG_EXT_RECUR_PARTITIONS
   if (!has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT);
     return;
   }
 
 #if CONFIG_SDP
-  int parent_block_width = block_size_wide[bsize];
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   if (xd->tree_type == CHROMA_PART && parent_block_width >= SHARED_PART_SIZE) {
     int luma_split_flag = get_luma_split_flag(bsize, mi_params, mi_row, mi_col);
@@ -2206,25 +2305,27 @@
 #endif
     aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
   }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 static AOM_INLINE void write_modes_sb(
     AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w,
-    const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row,
-    int mi_col, BLOCK_SIZE bsize) {
+    const TokenExtra **tok, const TokenExtra *const tok_end,
+    PARTITION_TREE *ptree,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    PARTITION_TREE *ptree_luma,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   assert(bsize < BLOCK_SIZES_ALL);
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int quarter_step = mi_size_wide[bsize] / 4;
-  int i;
-#if CONFIG_SDP
-  const PARTITION_TYPE partition =
-      get_partition(cm, xd->tree_type == CHROMA_PART, mi_row, mi_col, bsize);
-#else
-  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
-#endif
+  const int hbs_w = mi_size_wide[bsize] / 2;
+  const int hbs_h = mi_size_high[bsize] / 2;
+  const int qbs_w = mi_size_wide[bsize] / 4;
+  const int qbs_h = mi_size_high[bsize] / 4;
+  assert(ptree);
+  const PARTITION_TYPE partition = ptree->partition;
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
 
   if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
@@ -2253,65 +2354,145 @@
     }
   }
 
-  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  write_partition(cm, xd, mi_row, mi_col, partition, bsize, ptree_luma, w);
+  const int track_ptree_luma =
+      ptree_luma ? (partition == ptree_luma->partition) : 0;
+#else
+  write_partition(cm, xd, mi_row, mi_col, partition, bsize, w);
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
   switch (partition) {
     case PARTITION_NONE:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[0],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col, subsize);
+      if (mi_row + hbs_h < mi_params->mi_rows) {
+        write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[1],
+#if CONFIG_SDP
+                       track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                       mi_row + hbs_h, mi_col, subsize);
+      }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      if (mi_row + hbs < mi_params->mi_rows)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      if (mi_row + hbs_h < mi_params->mi_rows)
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
     case PARTITION_VERT:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[0],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col, subsize);
+      if (mi_col + hbs_w < mi_params->mi_cols) {
+        write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[1],
+#if CONFIG_SDP
+                       track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                       mi_row, mi_col + hbs_w, subsize);
+      }
+#else  // CONFIG_EXT_RECUR_PARTITIONS
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      if (mi_col + hbs < mi_params->mi_cols)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      if (mi_col + hbs_w < mi_params->mi_cols)
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs_w);
+#endif
       break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3:
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[0],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col, subsize);
+      if (mi_row + qbs_h >= mi_params->mi_rows) break;
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[1],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row + qbs_h, mi_col,
+                     get_partition_subsize(bsize, PARTITION_HORZ));
+      if (mi_row + 3 * qbs_h >= mi_params->mi_rows) break;
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[2],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[2] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row + 3 * qbs_h, mi_col, subsize);
+      break;
+    case PARTITION_VERT_3:
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[0],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col, subsize);
+      if (mi_col + qbs_w >= mi_params->mi_cols) break;
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[1],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col + qbs_w,
+                     get_partition_subsize(bsize, PARTITION_VERT));
+      if (mi_col + 3 * qbs_w >= mi_params->mi_cols) break;
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[2],
+#if CONFIG_SDP
+                     track_ptree_luma ? ptree_luma->sub_tree[2] : NULL,
+#endif  // CONFIG_SDP
+                     mi_row, mi_col + 3 * qbs_w, subsize);
+      break;
+#else
     case PARTITION_SPLIT:
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
-                     subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[0], mi_row,
+                     mi_col, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[1], mi_row,
+                     mi_col + hbs_w, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[2],
+                     mi_row + hbs_h, mi_col, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, ptree->sub_tree[3],
+                     mi_row + hbs_h, mi_col + hbs_w, subsize);
       break;
     case PARTITION_HORZ_A:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs_w);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col);
       break;
     case PARTITION_HORZ_B:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col + hbs_w);
       break;
     case PARTITION_VERT_A:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs_w);
       break;
     case PARTITION_VERT_B:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs_w);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs_h, mi_col + hbs_w);
       break;
     case PARTITION_HORZ_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * qbs_h;
         if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
-
         write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
       }
       break;
     case PARTITION_VERT_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * qbs_w;
         if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
-
         write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
       }
       break;
-    default: assert(0);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    default: assert(0); break;
   }
 
   // update partition context
@@ -2353,6 +2534,8 @@
 
     for (int mi_col = mi_col_start; mi_col < mi_col_end;
          mi_col += cm->seq_params.mib_size) {
+      av1_reset_is_mi_coded_map(xd, cm->seq_params.mib_size);
+      xd->sbi = av1_get_sb_info(cm, mi_row, mi_col);
       cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
 #if CONFIG_SDP
       const int total_loop_num =
@@ -2361,17 +2544,26 @@
               ? 2
               : 1;
       xd->tree_type = (total_loop_num == 1 ? SHARED_PART : LUMA_PART);
-#endif
-      write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
-                     cm->seq_params.sb_size);
-#if CONFIG_SDP
+      write_modes_sb(cpi, tile, w, &tok, tok_end,
+                     xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                     NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                     mi_row, mi_col, cm->seq_params.sb_size);
       if (total_loop_num == 2) {
         xd->tree_type = CHROMA_PART;
-        write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
-                       cm->seq_params.sb_size);
+        write_modes_sb(cpi, tile, w, &tok, tok_end,
+                       xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                       xd->sbi->ptree_root[0],
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                       mi_row, mi_col, cm->seq_params.sb_size);
         xd->tree_type = SHARED_PART;
       }
-#endif
+#else
+      write_modes_sb(cpi, tile, w, &tok, tok_end, xd->sbi->ptree_root, mi_row,
+                     mi_col, cm->seq_params.sb_size);
+#endif  // CONFIG_SDP
     }
     assert(tok == tok_end);
   }
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 4bf2617..c18d28d 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -244,11 +244,7 @@
   //! \copydoc MB_MODE_INFO_EXT::mode_context
   int16_t mode_context;
   //! Offset of current coding block's coeff buffer relative to the sb.
-#if CONFIG_SDP
   int cb_offset[MAX_MB_PLANE];
-#else
-  int cb_offset;
-#endif
 } MB_MODE_INFO_EXT_FRAME;
 
 /*! \brief Txfm search results for a partition
@@ -412,6 +408,101 @@
   uint8_t *tmp_best_mask_buf;
 } CompoundTypeRdBuffers;
 
+/*!\cond */
+/*! \brief MV cost types
+ */
+enum {
+  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
+  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
+  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
+  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+/*!\endcond */
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+/*! \brief max length of start Mv list
+ */
+#define kSMSMaxStartMVs 1
+/*! \brief Contains data for simple motion
+ */
+typedef struct SimpleMotionData {
+  MV mv_ref;                               /*!< mv reference */
+  MV fullmv;                               /*!< mv full */
+  MV submv;                                /*!< mv subpel */
+  unsigned int sse;                        /*!< sse */
+  unsigned int var;                        /*!< variance */
+  int64_t dist;                            /*!< distortion */
+  int rate;                                /*!< rate */
+  int64_t rdcost;                          /*!< rdcost */
+  int valid;                               /*!< whether valid */
+  BLOCK_SIZE bsize;                        /*!< blocksize */
+  int mi_row;                              /*!< row position in mi units */
+  int mi_col;                              /*!< col position in mi units */
+  MV_COST_TYPE mv_cost_type;               /*!< mv cost type */
+  int sadpb;                               /*!< sad per bit */
+  int errorperbit;                         /*!< error per bit */
+  MV start_mv_list[kSMSMaxStartMVs];       /*!< start mv list */
+  int num_start_mvs;                       /*!< number of start mvs */
+  int has_prev_partition;                  /*!< has previous partition */
+  PARTITION_TYPE prev_partition;           /*!< previous partition */
+  struct PICK_MODE_CONTEXT *mode_cache[1]; /*!< mode cache */
+} SimpleMotionData;
+
+/*!\cond */
+#define BLOCK_128_COUNT 1
+#define BLOCK_64_COUNT 3
+#define BLOCK_32_COUNT 7
+#define BLOCK_16_COUNT 15
+#define BLOCK_8_COUNT 31
+#define BLOCK_4_COUNT 32
+
+#define MAKE_SM_DATA_BUF(width, height) \
+  SimpleMotionData                      \
+      b_##width##x##height[BLOCK_##width##_COUNT * BLOCK_##height##_COUNT]
+/*!\endcond */
+
+/*! \brief Simple motion data buffers
+ */
+typedef struct SimpleMotionDataBufs {
+  /*!\cond */
+  // Square blocks
+  MAKE_SM_DATA_BUF(128, 128);
+  MAKE_SM_DATA_BUF(64, 64);
+  MAKE_SM_DATA_BUF(32, 32);
+  MAKE_SM_DATA_BUF(16, 16);
+  MAKE_SM_DATA_BUF(8, 8);
+  MAKE_SM_DATA_BUF(4, 4);
+
+  // 1:2 blocks
+  MAKE_SM_DATA_BUF(64, 128);
+  MAKE_SM_DATA_BUF(32, 64);
+  MAKE_SM_DATA_BUF(16, 32);
+  MAKE_SM_DATA_BUF(8, 16);
+  MAKE_SM_DATA_BUF(4, 8);
+
+  // 2:1 blocks
+  MAKE_SM_DATA_BUF(128, 64);
+  MAKE_SM_DATA_BUF(64, 32);
+  MAKE_SM_DATA_BUF(32, 16);
+  MAKE_SM_DATA_BUF(16, 8);
+  MAKE_SM_DATA_BUF(8, 4);
+
+  // 1:4 blocks
+  MAKE_SM_DATA_BUF(16, 64);
+  MAKE_SM_DATA_BUF(8, 32);
+  MAKE_SM_DATA_BUF(4, 16);
+
+  // 4:1 blocks
+  MAKE_SM_DATA_BUF(64, 16);
+  MAKE_SM_DATA_BUF(32, 8);
+  MAKE_SM_DATA_BUF(16, 4);
+  /*!\endcond */
+} SimpleMotionDataBufs;
+
+#undef MAKE_SM_DATA_BUF
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 /*! \brief Holds some parameters related to partitioning schemes in AV1.
  */
 // TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
@@ -606,7 +697,11 @@
                     [EXT_PARTITION_TYPES];
 #else
   int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
-#endif
+#endif  // CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+  /*! Cost for coding the partition for rectangular blocks. */
+  int partition_rec_cost[PARTITION_CONTEXTS_REC][PARTITION_TYPES_REC];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   /**@}*/
 
   /*****************************************************************************
@@ -937,11 +1032,7 @@
    */
   CB_COEFF_BUFFER *cb_coef_buff;
   //! Offset of current coding block's coeff buffer relative to the sb.
-#if CONFIG_SDP
   int cb_offset[MAX_MB_PLANE];
-#else
-  uint16_t cb_offset;
-#endif
 
   //! Modified source and masks used for fast OBMC search.
   OBMCBuffer obmc_buffer;
@@ -1206,6 +1297,14 @@
   unsigned int source_variance;
   //! SSE of the current predictor.
   unsigned int pred_sse[REF_FRAMES];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  /*! Simple motion search buffers. */
+  SimpleMotionDataBufs *sms_bufs;
+  /*! \brief Determines what encoding decision should be reused. */
+  int reuse_inter_mode_cache_type;
+  /*! \brief The mode to reuse during \ref av1_rd_pick_inter_mode_sb. */
+  MB_MODE_INFO *inter_mode_cache;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   /**@}*/
 } MACROBLOCK;
 #undef SINGLE_REF_MODES
@@ -1321,6 +1420,11 @@
   return (txb_skip[blk_idx] >> plane) & 1;
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static INLINE int should_reuse_mode(const MACROBLOCK *x, int mode_flag) {
+  return x->reuse_inter_mode_cache_type & mode_flag;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 /*!\endcond */
 
 #ifdef __cplusplus
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index f33094d..0162b55 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -457,8 +457,12 @@
   const int bw = block_size_wide[bsize];
   mbmi->interintra_mode = interintra_mode;
   int rmode = interintra_mode_cost[interintra_mode];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred, bw);
+#else
   av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                             intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
   model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist,
                                           &skip_txfm_sb, &skip_sse_sb, NULL,
@@ -520,8 +524,13 @@
   uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
   for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
     mbmi->interintra_mode = mode;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred,
+                                              bw);
+#else
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                               intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
     const int rate_overhead =
         interintra_mode_cost[mode] +
@@ -575,8 +584,13 @@
                                *best_interintra_mode != INTERINTRA_MODES;
   if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
     mbmi->interintra_mode = *best_interintra_mode;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred,
+                                              bw);
+#else
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                               intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
   }
 
@@ -634,15 +648,25 @@
     mbmi->interintra_mode = best_mode;
     mbmi->interintra_wedge_index = best_wedge_index;
     if (best_mode != INTERINTRA_MODES - 1) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+      av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred,
+                                                bw);
+#else
       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                 intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     }
   } else if (!try_smooth_interintra) {
     if (*best_interintra_mode == INTERINTRA_MODES) {
       mbmi->interintra_mode = INTERINTRA_MODES - 1;
       *best_interintra_mode = INTERINTRA_MODES - 1;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred,
+                                                bw);
+#else
       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                 intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       // Pick wedge mask based on INTERINTRA_MODES - 1
       *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
       // Find the best interintra mode for the chosen wedge mask
@@ -657,14 +681,24 @@
 
       // Recompute prediction if required
       if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+        av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst,
+                                                  intrapred, bw);
+#else
         av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                   intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       }
     } else {
       // Pick wedge mask for the best interintra mode (reused)
       mbmi->interintra_mode = *best_interintra_mode;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      av1_build_intra_predictors_for_interintra(cm, xd, 0, orig_dst, intrapred,
+                                                bw);
+#else
       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                 intrapred, bw);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
     }
   } else {
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 6d07ef2..028fd3d 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -11,6 +11,7 @@
 
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
 
 static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
   BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
@@ -38,6 +39,12 @@
 
   dst_ctx->rd_stats = src_ctx->rd_stats;
   dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  for (int i = 0; i < 2; ++i) {
+    memcpy(dst_ctx->color_index_map[i], src_ctx->color_index_map[i],
+           sizeof(src_ctx->color_index_map[i][0]) * src_ctx->num_4x4_blk * 16);
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
@@ -64,13 +71,23 @@
   }
 }
 
-PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, BLOCK_SIZE bsize,
+PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, int mi_row, int mi_col,
+                                 BLOCK_SIZE bsize, PC_TREE *parent,
+                                 PARTITION_TYPE parent_partition, int index,
+                                 int subsampling_x, int subsampling_y,
                                  PC_TREE_SHARED_BUFFERS *shared_bufs) {
   PICK_MODE_CONTEXT *ctx = NULL;
   struct aom_internal_error_info error;
 
   AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
   ctx->rd_mode_is_ready = 0;
+  ctx->parent = parent;
+  ctx->index = index;
+  set_chroma_ref_info(mi_row, mi_col, index, bsize, &ctx->chroma_ref_info,
+                      parent ? &parent->chroma_ref_info : NULL,
+                      parent ? parent->block_size : BLOCK_INVALID,
+                      parent_partition, subsampling_x, subsampling_y);
+  ctx->mic.chroma_ref_info = ctx->chroma_ref_info;
 
   const int num_planes = av1_num_planes(cm);
   const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
@@ -100,7 +117,7 @@
           aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
     }
   }
-
+  av1_invalid_rd_stats(&ctx->rd_stats);
   return ctx;
 }
 
@@ -128,30 +145,52 @@
   aom_free(ctx);
 }
 
-PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+PC_TREE *av1_alloc_pc_tree_node(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                PC_TREE *parent,
+                                PARTITION_TYPE parent_partition, int index,
+                                int is_last, int subsampling_x,
+                                int subsampling_y) {
   PC_TREE *pc_tree = NULL;
   struct aom_internal_error_info error;
 
   AOM_CHECK_MEM_ERROR(&error, pc_tree, aom_calloc(1, sizeof(*pc_tree)));
 
+  pc_tree->mi_row = mi_row;
+  pc_tree->mi_col = mi_col;
+  pc_tree->parent = parent;
+  pc_tree->index = index;
   pc_tree->partitioning = PARTITION_NONE;
   pc_tree->block_size = bsize;
-  pc_tree->index = 0;
+  pc_tree->is_last_subblock = is_last;
+  av1_invalid_rd_stats(&pc_tree->rd_cost);
+  set_chroma_ref_info(mi_row, mi_col, index, bsize, &pc_tree->chroma_ref_info,
+                      parent ? &parent->chroma_ref_info : NULL,
+                      parent ? parent->block_size : BLOCK_INVALID,
+                      parent_partition, subsampling_x, subsampling_y);
 
   pc_tree->none = NULL;
   for (int i = 0; i < 2; ++i) {
     pc_tree->horizontal[i] = NULL;
     pc_tree->vertical[i] = NULL;
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  for (int i = 0; i < 3; ++i) {
+    pc_tree->horizontal3[i] = NULL;
+    pc_tree->vertical3[i] = NULL;
+  }
+#else
   for (int i = 0; i < 3; ++i) {
     pc_tree->horizontala[i] = NULL;
     pc_tree->horizontalb[i] = NULL;
     pc_tree->verticala[i] = NULL;
     pc_tree->verticalb[i] = NULL;
   }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   for (int i = 0; i < 4; ++i) {
+#if !CONFIG_EXT_RECUR_PARTITIONS
     pc_tree->horizontal4[i] = NULL;
     pc_tree->vertical4[i] = NULL;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     pc_tree->split[i] = NULL;
   }
 
@@ -174,11 +213,38 @@
     FREE_PMC_NODE(pc_tree->none);
 
   for (int i = 0; i < 2; ++i) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    if ((!keep_best || (partition != PARTITION_HORZ)) &&
+        pc_tree->horizontal[i] != NULL) {
+      av1_free_pc_tree_recursive(pc_tree->horizontal[i], num_planes, 0, 0);
+      pc_tree->horizontal[i] = NULL;
+    }
+    if ((!keep_best || (partition != PARTITION_VERT)) &&
+        pc_tree->vertical[i] != NULL) {
+      av1_free_pc_tree_recursive(pc_tree->vertical[i], num_planes, 0, 0);
+      pc_tree->vertical[i] = NULL;
+    }
+#else
     if (!keep_best || (partition != PARTITION_HORZ))
       FREE_PMC_NODE(pc_tree->horizontal[i]);
     if (!keep_best || (partition != PARTITION_VERT))
       FREE_PMC_NODE(pc_tree->vertical[i]);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  for (int i = 0; i < 3; ++i) {
+    if ((!keep_best || (partition != PARTITION_HORZ_3)) &&
+        pc_tree->horizontal3[i] != NULL) {
+      av1_free_pc_tree_recursive(pc_tree->horizontal3[i], num_planes, 0, 0);
+      pc_tree->horizontal3[i] = NULL;
+    }
+    if ((!keep_best || (partition != PARTITION_VERT_3)) &&
+        pc_tree->vertical3[i] != NULL) {
+      av1_free_pc_tree_recursive(pc_tree->vertical3[i], num_planes, 0, 0);
+      pc_tree->vertical3[i] = NULL;
+    }
+  }
+#else
   for (int i = 0; i < 3; ++i) {
     if (!keep_best || (partition != PARTITION_HORZ_A))
       FREE_PMC_NODE(pc_tree->horizontala[i]);
@@ -195,6 +261,7 @@
     if (!keep_best || (partition != PARTITION_VERT_4))
       FREE_PMC_NODE(pc_tree->vertical4[i]);
   }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   if (!keep_best || (partition != PARTITION_SPLIT)) {
     for (int i = 0; i < 4; ++i) {
@@ -208,6 +275,144 @@
   if (!keep_best && !keep_none) aom_free(pc_tree);
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_copy_pc_tree_recursive(const AV1_COMMON *cm, PC_TREE *dst,
+                                PC_TREE *src, int ss_x, int ss_y,
+                                PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                int num_planes) {
+  // Copy the best partition type. For basic information like bsize and index,
+  // we assume they have been set properly when initializing the dst PC_TREE
+  dst->partitioning = src->partitioning;
+  dst->rd_cost = src->rd_cost;
+  const BLOCK_SIZE bsize = dst->block_size;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, src->partitioning);
+  const int mi_row = src->mi_row;
+  const int mi_col = src->mi_col;
+
+  switch (src->partitioning) {
+    // PARTITION_NONE
+    case PARTITION_NONE:
+      if (dst->none) av1_free_pmc(dst->none, num_planes);
+      dst->none = NULL;
+      if (src->none) {
+        dst->none = av1_alloc_pmc(cm, mi_row, mi_col, bsize, dst,
+                                  PARTITION_NONE, 0, ss_x, ss_y, shared_bufs);
+        av1_copy_tree_context(dst->none, src->none);
+      }
+      break;
+    // PARTITION_SPLIT
+    case PARTITION_SPLIT:
+      if (is_partition_valid(bsize, PARTITION_SPLIT)) {
+        for (int i = 0; i < 4; ++i) {
+          if (dst->split[i]) {
+            av1_free_pc_tree_recursive(dst->split[i], num_planes, 0, 0);
+            dst->split[i] = NULL;
+          }
+          if (src->split[i]) {
+            const int x_idx = (i & 1) * (mi_size_wide[bsize] >> 1);
+            const int y_idx = (i >> 1) * (mi_size_high[bsize] >> 1);
+            dst->split[i] = av1_alloc_pc_tree_node(
+                mi_row + y_idx, mi_col + x_idx, subsize, dst, PARTITION_SPLIT,
+                i, i == 3, ss_x, ss_y);
+            av1_copy_pc_tree_recursive(cm, dst->split[i], src->split[i], ss_x,
+                                       ss_y, shared_bufs, num_planes);
+          }
+        }
+      }
+      break;
+    // PARTITION_HORZ
+    case PARTITION_HORZ:
+      if (is_partition_valid(bsize, PARTITION_HORZ)) {
+        for (int i = 0; i < 2; ++i) {
+          if (dst->horizontal[i]) {
+            av1_free_pc_tree_recursive(dst->horizontal[i], num_planes, 0, 0);
+            dst->horizontal[i] = NULL;
+          }
+          if (src->horizontal[i]) {
+            const int this_mi_row = mi_row + i * (mi_size_high[bsize] >> 1);
+            dst->horizontal[i] =
+                av1_alloc_pc_tree_node(this_mi_row, mi_col, subsize, dst,
+                                       PARTITION_HORZ, i, i == 1, ss_x, ss_y);
+            av1_copy_pc_tree_recursive(cm, dst->horizontal[i],
+                                       src->horizontal[i], ss_x, ss_y,
+                                       shared_bufs, num_planes);
+          }
+        }
+      }
+      break;
+    // PARTITION_VERT
+    case PARTITION_VERT:
+      if (is_partition_valid(bsize, PARTITION_VERT)) {
+        for (int i = 0; i < 2; ++i) {
+          if (dst->vertical[i]) {
+            av1_free_pc_tree_recursive(dst->vertical[i], num_planes, 0, 0);
+            dst->vertical[i] = NULL;
+          }
+          if (src->vertical[i]) {
+            const int this_mi_col = mi_col + i * (mi_size_wide[bsize] >> 1);
+            dst->vertical[i] =
+                av1_alloc_pc_tree_node(mi_row, this_mi_col, subsize, dst,
+                                       PARTITION_VERT, i, i == 1, ss_x, ss_y);
+            av1_copy_pc_tree_recursive(cm, dst->vertical[i], src->vertical[i],
+                                       ss_x, ss_y, shared_bufs, num_planes);
+          }
+        }
+      }
+      break;
+    // PARTITION_HORZ_3
+    case PARTITION_HORZ_3:
+      if (is_partition_valid(bsize, PARTITION_HORZ_3)) {
+        const int mi_rows[3] = { mi_row, mi_row + (mi_size_high[bsize] >> 2),
+                                 mi_row + (mi_size_high[bsize] >> 2) * 3 };
+        const BLOCK_SIZE subsizes[3] = {
+          subsize, get_partition_subsize(bsize, PARTITION_HORZ), subsize
+        };
+
+        for (int i = 0; i < 3; ++i) {
+          if (dst->horizontal3[i]) {
+            av1_free_pc_tree_recursive(dst->horizontal3[i], num_planes, 0, 0);
+            dst->horizontal3[i] = NULL;
+          }
+          if (src->horizontal3[i]) {
+            dst->horizontal3[i] =
+                av1_alloc_pc_tree_node(mi_rows[i], mi_col, subsizes[i], dst,
+                                       PARTITION_HORZ_3, i, i == 2, ss_x, ss_y);
+            av1_copy_pc_tree_recursive(cm, dst->horizontal3[i],
+                                       src->horizontal3[i], ss_x, ss_y,
+                                       shared_bufs, num_planes);
+          }
+        }
+      }
+      break;
+    // PARTITION_VERT_3
+    case PARTITION_VERT_3:
+      if (is_partition_valid(bsize, PARTITION_VERT_3)) {
+        const int mi_cols[3] = { mi_col, mi_col + (mi_size_wide[bsize] >> 2),
+                                 mi_col + (mi_size_wide[bsize] >> 2) * 3 };
+        const BLOCK_SIZE subsizes[3] = {
+          subsize, get_partition_subsize(bsize, PARTITION_VERT), subsize
+        };
+
+        for (int i = 0; i < 3; ++i) {
+          if (dst->vertical3[i]) {
+            av1_free_pc_tree_recursive(dst->vertical3[i], num_planes, 0, 0);
+            dst->vertical3[i] = NULL;
+          }
+          if (src->vertical3[i]) {
+            dst->vertical3[i] =
+                av1_alloc_pc_tree_node(mi_row, mi_cols[i], subsizes[i], dst,
+                                       PARTITION_VERT_3, i, i == 2, ss_x, ss_y);
+            av1_copy_pc_tree_recursive(cm, dst->vertical3[i], src->vertical3[i],
+                                       ss_x, ss_y, shared_bufs, num_planes);
+          }
+        }
+      }
+      break;
+    default: assert(0 && "Not a valid partition."); break;
+  }
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
                                         int stat_generation_stage) {
   const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
@@ -272,3 +477,68 @@
     td->sms_tree = NULL;
   }
 }
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_setup_sms_bufs(AV1_COMMON *cm, ThreadData *td) {
+  CHECK_MEM_ERROR(cm, td->sms_bufs, aom_malloc(sizeof(*td->sms_bufs)));
+}
+
+void av1_free_sms_bufs(ThreadData *td) {
+  if (td->sms_bufs != NULL) {
+    aom_free(td->sms_bufs);
+    td->sms_bufs = NULL;
+  }
+}
+
+PC_TREE *counterpart_from_different_partition(PC_TREE *pc_tree,
+                                              PC_TREE *target);
+
+static PC_TREE *look_for_counterpart_helper(PC_TREE *cur, PC_TREE *target) {
+  if (cur == NULL || cur == target) return NULL;
+
+  BLOCK_SIZE current_bsize = cur->block_size;
+  BLOCK_SIZE target_bsize = target->block_size;
+  if (current_bsize == target_bsize) {
+    return cur;
+  } else {
+    if (mi_size_wide[current_bsize] >= mi_size_wide[target_bsize] &&
+        mi_size_high[current_bsize] >= mi_size_high[target_bsize]) {
+      return counterpart_from_different_partition(cur, target);
+    } else {
+      return NULL;
+    }
+  }
+}
+
+PC_TREE *counterpart_from_different_partition(PC_TREE *pc_tree,
+                                              PC_TREE *target) {
+  if (pc_tree == NULL || pc_tree == target) return NULL;
+
+  PC_TREE *result;
+  result = look_for_counterpart_helper(pc_tree->split[0], target);
+  if (result) return result;
+  result = look_for_counterpart_helper(pc_tree->horizontal[0], target);
+  if (result) return result;
+  result = look_for_counterpart_helper(pc_tree->vertical[0], target);
+  if (result) return result;
+  result = look_for_counterpart_helper(pc_tree->horizontal3[0], target);
+  if (result) return result;
+  result = look_for_counterpart_helper(pc_tree->vertical3[0], target);
+  if (result) return result;
+
+  return NULL;
+}
+
+PC_TREE *av1_look_for_counterpart_block(PC_TREE *pc_tree) {
+  if (!pc_tree) return 0;
+
+  // Find the highest possible common parent node
+  PC_TREE *current = pc_tree;
+  while (current->index == 0 && current->parent) {
+    current = current->parent;
+  }
+
+  // Search from the highest common ancestor
+  return counterpart_from_different_partition(current, pc_tree);
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index f243233..f03414b 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -60,12 +60,21 @@
 
   int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
                          // been made.
+  CHROMA_REF_INFO chroma_ref_info;
+  struct PC_TREE *parent;
+  int index;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
   PARTITION_TYPE partitioning;
   BLOCK_SIZE block_size;
   PICK_MODE_CONTEXT *none;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  struct PC_TREE *horizontal[2];
+  struct PC_TREE *vertical[2];
+  struct PC_TREE *horizontal3[3];
+  struct PC_TREE *vertical3[3];
+#else
   PICK_MODE_CONTEXT *horizontal[2];
   PICK_MODE_CONTEXT *vertical[2];
   PICK_MODE_CONTEXT *horizontala[3];
@@ -74,8 +83,15 @@
   PICK_MODE_CONTEXT *verticalb[3];
   PICK_MODE_CONTEXT *horizontal4[4];
   PICK_MODE_CONTEXT *vertical4[4];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   struct PC_TREE *split[4];
+  struct PC_TREE *parent;
+  int mi_row;
+  int mi_col;
   int index;
+  int is_last_subblock;
+  CHROMA_REF_INFO chroma_ref_info;
+  RD_STATS rd_cost;
 } PC_TREE;
 
 typedef struct SIMPLE_MOTION_DATA_TREE {
@@ -91,15 +107,32 @@
   int sms_rect_valid;
 } SIMPLE_MOTION_DATA_TREE;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+PC_TREE *av1_look_for_counterpart_block(PC_TREE *pc_tree);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
                                    PC_TREE_SHARED_BUFFERS *shared_bufs);
 void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
 
-PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+PC_TREE *av1_alloc_pc_tree_node(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                PC_TREE *parent,
+                                PARTITION_TYPE parent_partition, int index,
+                                int is_last, int subsampling_x,
+                                int subsampling_y);
 void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
                                 int keep_none);
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_copy_pc_tree_recursive(const AV1_COMMON *cm, PC_TREE *dst,
+                                PC_TREE *src, int ss_x, int ss_y,
+                                PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                int num_planes);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
-PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, BLOCK_SIZE bsize,
+PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, int mi_row, int mi_col,
+                                 BLOCK_SIZE bsize, PC_TREE *parent,
+                                 PARTITION_TYPE parent_partition, int index,
+                                 int subsampling_x, int subsampling_y,
                                  PC_TREE_SHARED_BUFFERS *shared_bufs);
 void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
@@ -107,6 +140,10 @@
 
 void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
 void av1_free_sms_tree(struct ThreadData *td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_setup_sms_bufs(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_sms_bufs(struct ThreadData *td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 8adec1e..3c05fb6 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -881,7 +881,7 @@
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size);
+  av1_setup_src_planes(x, src, 0, 0, num_planes, NULL);
 
   av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
                          cm->seq_params.subsampling_y, num_planes);
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 0592a0b..86b7fcc 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -198,7 +198,7 @@
 
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col, const int num_planes,
-                          BLOCK_SIZE bsize) {
+                          const CHROMA_REF_INFO *chr_ref_info) {
   // Set current frame pointer.
   x->e_mbd.cur_buf = src;
 
@@ -206,10 +206,10 @@
   // the static analysis warnings.
   for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) {
     const int is_uv = i > 0;
-    setup_pred_plane(
-        &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv],
-        src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL,
-        x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y);
+    setup_pred_plane(&x->plane[i].src, src->buffers[i], src->crop_widths[is_uv],
+                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+                     mi_col, NULL, x->e_mbd.plane[i].subsampling_x,
+                     x->e_mbd.plane[i].subsampling_y, chr_ref_info);
   }
 }
 
@@ -240,7 +240,7 @@
 
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
   // Delta-q modulation based on variance
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, NULL);
 
   int current_qindex = cm->quant_params.base_qindex;
   if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
@@ -287,7 +287,7 @@
   assert(current_qindex > 0);
 
   x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
-  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size, NULL);
   xd->mi[0]->current_qindex = current_qindex;
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 
@@ -483,6 +483,16 @@
   reset_hash_records(&x->txfm_search_info, cpi->sf.tx_sf.use_inter_txb_hash);
   av1_zero(x->picked_ref_frames_mask);
   av1_invalid_rd_stats(rd_cost);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  av1_init_sms_data_bufs(x->sms_bufs);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP
+  if (x->e_mbd.tree_type == CHROMA_PART) {
+    assert(is_bsize_square(x->sb_enc.min_partition_size));
+    x->sb_enc.min_partition_size =
+        AOMMAX(x->sb_enc.min_partition_size, BLOCK_8X8);
+  }
+#endif  // CONFIG_SDP
 }
 
 /*!\brief Encode a superblock (RD-search-based)
@@ -507,6 +517,11 @@
   int64_t dummy_dist;
   RD_STATS dummy_rdc;
   SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
+  const int ss_x = cm->seq_params.subsampling_x;
+  const int ss_y = cm->seq_params.subsampling_y;
+  (void)tile_info;
+  (void)num_planes;
+  (void)mi;
 
 #if CONFIG_SDP
   const int total_loop_num =
@@ -514,58 +529,110 @@
        cm->seq_params.enable_sdp)
           ? 2
           : 1;
+#endif  // CONFIG_SDP
+#if CONFIG_SDP || CONFIG_EXT_RECUR_PARTITIONS
   MACROBLOCKD *const xd = &x->e_mbd;
-#endif
+#endif  // CONFIG_SDP || CONFIG_EXT_RECUR_PARTITIONS
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  x->sms_bufs = td->sms_bufs;
+  x->reuse_inter_mode_cache_type = cpi->sf.inter_sf.reuse_erp_mode_flag;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
                     1);
 
   // Encode the superblock
   if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
     // partition search by adjusting a fixed-size partition
-    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size, NULL);
     const BLOCK_SIZE bsize =
         seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
 #if CONFIG_SDP
     for (int loop_idx = 0; loop_idx < total_loop_num; loop_idx++) {
+      const BLOCK_SIZE min_partition_size = x->sb_enc.min_partition_size;
       xd->tree_type =
           (total_loop_num == 1 ? SHARED_PART
                                : (loop_idx == 0 ? LUMA_PART : CHROMA_PART));
       init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
                         mi_col, 1);
 #endif
-      PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+#if CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP
+      av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
+      av1_build_partition_tree_fixed_partitioning(
+          cm, mi_row, mi_col, bsize,
+          xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)]);
+#else
+      av1_reset_ptree_in_sbi(xd->sbi);
+      av1_build_partition_tree_fixed_partitioning(cm, mi_row, mi_col, bsize,
+                                                  xd->sbi->ptree_root);
+#endif  // CONFIG_SDP
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      PC_TREE *const pc_root = av1_alloc_pc_tree_node(
+          mi_row, mi_col, sb_size, NULL, PARTITION_NONE, 0, 1, ss_x, ss_y);
       av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                           &dummy_rate, &dummy_dist, 1, pc_root);
+                           &dummy_rate, &dummy_dist, 1,
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+                           xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#elif CONFIG_EXT_RECUR_PARTITIONS
+                         xd->sbi->ptree_root,
+#else   // !CONFIG_EXT_RECUR_PARTITIONS
+                         NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+                           pc_root);
       av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
 #if CONFIG_SDP
+      x->sb_enc.min_partition_size = min_partition_size;
     }
     xd->tree_type = SHARED_PART;
 #endif
   } else if (cpi->partition_search_skippable_frame) {
     // partition search by adjusting a fixed-size partition for which the size
     // is determined by the source variance
-    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size, NULL);
     const BLOCK_SIZE bsize =
         get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
 #if CONFIG_SDP
     for (int loop_idx = 0; loop_idx < total_loop_num; loop_idx++) {
+      const BLOCK_SIZE min_partition_size = x->sb_enc.min_partition_size;
       xd->tree_type =
           (total_loop_num == 1 ? SHARED_PART
                                : (loop_idx == 0 ? LUMA_PART : CHROMA_PART));
       init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
                         mi_col, 1);
 #endif
-      PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+      PC_TREE *const pc_root = av1_alloc_pc_tree_node(
+          mi_row, mi_col, sb_size, NULL, PARTITION_NONE, 0, 1, ss_x, ss_y);
+#if CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP
+      av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
+      av1_build_partition_tree_fixed_partitioning(
+          cm, mi_row, mi_col, bsize,
+          xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)]);
+#else
+      av1_reset_ptree_in_sbi(xd->sbi);
+      av1_build_partition_tree_fixed_partitioning(cm, mi_row, mi_col, bsize,
+                                                  xd->sbi->ptree_root);
+#endif  // CONFIG_SDP
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                           &dummy_rate, &dummy_dist, 1, pc_root);
+                           &dummy_rate, &dummy_dist, 1,
+#if CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+                           xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#elif CONFIG_EXT_RECUR_PARTITIONS
+                         xd->sbi->ptree_root,
+#else   // !CONFIG_EXT_RECUR_PARTITIONS
+                         NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS && CONFIG_SDP
+                           pc_root);
       av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
 #if CONFIG_SDP
+      x->sb_enc.min_partition_size = min_partition_size;
     }
     xd->tree_type = SHARED_PART;
-#endif
+#endif  // CONFIG_SDP
   } else {
     // The most exhaustive recursive partition search
     SuperBlockEnc *sb_enc = &x->sb_enc;
@@ -592,17 +659,24 @@
     if (num_passes == 1) {
 #if CONFIG_SDP
       for (int loop_idx = 0; loop_idx < total_loop_num; loop_idx++) {
+        const BLOCK_SIZE min_partition_size = sb_enc->min_partition_size;
         xd->tree_type =
             (total_loop_num == 1 ? SHARED_PART
                                  : (loop_idx == 0 ? LUMA_PART : CHROMA_PART));
         init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
                           mi_col, 1);
 #endif
-        PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
-        av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                              &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
-                              SB_SINGLE_PASS, NULL);
+        PC_TREE *const pc_root = av1_alloc_pc_tree_node(
+            mi_row, mi_col, sb_size, NULL, PARTITION_NONE, 0, 1, ss_x, ss_y);
+        av1_rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc,
+            dummy_rdc, pc_root,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            xd->tree_type == CHROMA_PART ? xd->sbi->ptree_root[0] : NULL, NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            sms_root, NULL, SB_SINGLE_PASS, NULL);
 #if CONFIG_SDP
+        sb_enc->min_partition_size = min_partition_size;
       }
       xd->tree_type = SHARED_PART;
 #endif
@@ -612,17 +686,24 @@
       av1_backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
 #if CONFIG_SDP
       for (int loop_idx = 0; loop_idx < total_loop_num; loop_idx++) {
+        const BLOCK_SIZE min_partition_size = sb_enc->min_partition_size;
         xd->tree_type =
             (total_loop_num == 1 ? SHARED_PART
                                  : (loop_idx == 0 ? LUMA_PART : CHROMA_PART));
         init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
                           mi_col, 1);
 #endif
-        PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(sb_size);
-        av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                              &dummy_rdc, dummy_rdc, pc_root_p0, sms_root, NULL,
-                              SB_DRY_PASS, NULL);
+        PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(
+            mi_row, mi_col, sb_size, NULL, PARTITION_NONE, 0, 1, ss_x, ss_y);
+        av1_rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc,
+            dummy_rdc, pc_root_p0,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            xd->tree_type == CHROMA_PART ? xd->sbi->ptree_root[0] : NULL, NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            sms_root, NULL, SB_DRY_PASS, NULL);
 #if CONFIG_SDP
+        sb_enc->min_partition_size = min_partition_size;
       }
       xd->tree_type = SHARED_PART;
 #endif
@@ -636,17 +717,25 @@
       av1_restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
 #if CONFIG_SDP
       for (int loop_idx = 0; loop_idx < total_loop_num; loop_idx++) {
+        const BLOCK_SIZE min_partition_size = sb_enc->min_partition_size;
         xd->tree_type =
             (total_loop_num == 1 ? SHARED_PART
                                  : (loop_idx == 0 ? LUMA_PART : CHROMA_PART));
         init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
                           mi_col, 1);
 #endif
-        PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(sb_size);
-        av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                              &dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
-                              SB_WET_PASS, NULL);
+
+        PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(
+            mi_row, mi_col, sb_size, NULL, PARTITION_NONE, 0, 1, ss_x, ss_y);
+        av1_rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc,
+            dummy_rdc, pc_root_p1,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            xd->tree_type == CHROMA_PART ? xd->sbi->ptree_root[0] : NULL, NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+            sms_root, NULL, SB_WET_PASS, NULL);
 #if CONFIG_SDP
+        sb_enc->min_partition_size = min_partition_size;
       }
       xd->tree_type = SHARED_PART;
 #endif
@@ -713,7 +802,9 @@
   for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
        mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
     (*(enc_row_mt->sync_read_ptr))(row_mt_sync, sb_row, sb_col_in_tile);
+    av1_reset_is_mi_coded_map(xd, cm->seq_params.mib_size);
 
+    av1_set_sb_info(cm, xd, mi_row, mi_col);
     if (tile_data->allow_update_cdf && row_mt_enabled &&
         (tile_info->mi_row_start != mi_row)) {
       if ((tile_info->mi_col_start == mi_col)) {
@@ -778,8 +869,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Copy data over into macro block data structures.
-  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
-                       cm->seq_params.sb_size);
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, NULL);
 
   av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
                          cm->seq_params.subsampling_y, num_planes);
diff --git a/av1/encoder/encodeframe.h b/av1/encoder/encodeframe.h
index 36b38d5..f6f97b4 100644
--- a/av1/encoder/encodeframe.h
+++ b/av1/encoder/encodeframe.h
@@ -33,7 +33,8 @@
 
 void av1_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src, int mi_row,
-                          int mi_col, const int num_planes, BLOCK_SIZE bsize);
+                          int mi_col, const int num_planes,
+                          const CHROMA_REF_INFO *chr_ref_info);
 
 void av1_encode_frame(struct AV1_COMP *cpi);
 
@@ -43,6 +44,9 @@
                      int tile_col);
 void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
                        int tile_row, int tile_col, int mi_row);
+void av1_enc_set_offsets(const struct AV1_COMP *cpi, const TileInfo *const tile,
+                         struct macroblock *const x, int mi_row, int mi_col,
+                         BLOCK_SIZE bsize, CHROMA_REF_INFO *chr_ref_info);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index eddfe72..6ef3968 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -682,9 +682,9 @@
   }
 }
 
-void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                         int mi_row, int mi_col, BLOCK_SIZE bsize,
-                         const int num_planes) {
+void av1_restore_context(const AV1_COMMON *cm, MACROBLOCK *x,
+                         const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+                         int mi_col, BLOCK_SIZE bsize, const int num_planes) {
   MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = mi_size_wide[bsize];
@@ -727,6 +727,9 @@
          sizeof(*xd->above_txfm_context) * mi_width);
   memcpy(xd->left_txfm_context, ctx->tl,
          sizeof(*xd->left_txfm_context) * mi_height);
+
+  av1_mark_block_as_not_coded(xd, mi_row, mi_col, bsize,
+                              cm->seq_params.sb_size);
 }
 
 void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
@@ -1148,13 +1151,16 @@
 void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
                                        BLOCK_SIZE bsize, int mib_size,
                                        int mi_row, int mi_col) {
+#if !CONFIG_EXT_RECUR_PARTITIONS
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   const int sb_size_mask = mib_size - 1;
   const int mi_row_in_sb = mi_row & sb_size_mask;
   const int mi_col_in_sb = mi_col & sb_size_mask;
-  const int mi_size = mi_size_wide[bsize];
-  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
-    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+  const int mi_size_h = mi_size_high[bsize];
+  const int mi_size_w = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size_w; ++j) {
       x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
     }
   }
@@ -1342,7 +1348,13 @@
                      CDF_SIZE(10));
     }
   }
-#endif
+#endif  // CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+  for (int i = 0; i < PARTITION_CONTEXTS_REC; ++i) {
+    AVERAGE_CDF(ctx_left->partition_rec_cdf[i], ctx_tr->partition_rec_cdf[i],
+                4);
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
               SWITCHABLE_FILTERS);
   AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
@@ -1477,7 +1489,7 @@
   const int num_planes = av1_num_planes(cm);
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
 
-  av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+  av1_restore_context(cm, x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
                       num_planes);
 
   cpi->td.rd_counts = sb_fp_stats->rd_count;
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 5fde760..b39fe48 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -64,9 +64,13 @@
 
 // This structure contains block size related
 // variables for use in rd_pick_partition().
-typedef struct {
+typedef struct PartitionBlkParams {
   // Half of block width to determine block edge.
   int mi_step;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int mi_step_h;
+  int mi_step_w;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   // Block row and column indices.
   int mi_row;
@@ -79,11 +83,18 @@
   // Block width of current partition block.
   int width;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // Minimum partition size allowed.
+  BLOCK_SIZE min_partition_size;
+#else
   // Block width of minimum partition size allowed.
   int min_partition_size_1d;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
   // Flag to indicate if partition is 8x8 or higher size.
   int bsize_at_least_8x8;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Indicates edge blocks in frame.
   int has_rows;
@@ -100,7 +111,7 @@
 } PartitionBlkParams;
 
 // Structure holding state variables for partition search.
-typedef struct {
+typedef struct PartitionSearchState {
   // Intra partitioning related info.
   PartitionSearchInfo *intra_part_info;
 
@@ -118,6 +129,9 @@
 
   // Array holding partition type cost.
   int tmp_partition_cost[PARTITION_TYPES];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int partition_cost_table[EXT_PARTITION_TYPES];
+#endif
 
   // Pointer to partition cost buffer
   int *partition_cost;
@@ -130,6 +144,10 @@
   // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
   int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // New Simple Motion Result for PARTITION_NONE
+  SMSPartitionStats none_data;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   // Flags indicating if the corresponding partition was winner or not.
   // Used to bypass similar blocks during AB partition evaluation.
   int is_split_ctx_is_ready[2];
@@ -140,8 +158,11 @@
   int partition_none_allowed;
   int partition_rect_allowed[NUM_RECT_PARTS];
   int do_rectangular_split;
+#if !CONFIG_EXT_RECUR_PARTITIONS
   int do_square_split;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   int prune_rect_part[NUM_RECT_PARTS];
+  int is_block_splittable;
 
   // Chroma subsampling in x and y directions.
   int ss_x;
@@ -294,9 +315,9 @@
                          const MB_MODE_INFO *above_mi,
                          const MB_MODE_INFO *left_mi, const int intraonly);
 
-void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                         int mi_row, int mi_col, BLOCK_SIZE bsize,
-                         const int num_planes);
+void av1_restore_context(const AV1_COMMON *cm, MACROBLOCK *x,
+                         const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+                         int mi_col, BLOCK_SIZE bsize, const int num_planes);
 
 void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
                       int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -338,6 +359,12 @@
                            const TileInfo *const tile_info, const int mi_row,
                            const int mi_col);
 
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+  return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif  // NDEBUG
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 4d67ab4..deae150 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -563,15 +563,24 @@
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-#if CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+#elif CONFIG_SDP
   const TX_SIZE plane_tx_size =
       plane ? av1_get_max_uv_txsize(mbmi->sb_type[xd->tree_type == CHROMA_PART],
                                     pd->subsampling_x, pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 #else
+  const BLOCK_SIZE bsize_base =
+      plane ? mbmi->chroma_ref_info.bsize_base : mbmi->sb_type;
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -728,9 +737,8 @@
                                          encode_block_pass1, &args);
 }
 
-void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x,
                    RUN_TYPE dry_run) {
-  assert(bsize < BLOCK_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
 #if CONFIG_SDP
@@ -765,8 +773,24 @@
     const int subsampling_x = pd->subsampling_x;
     const int subsampling_y = pd->subsampling_y;
     if (plane && !xd->is_chroma_ref) break;
+
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, subsampling_x, subsampling_y);
+        get_mb_plane_block_size(xd, mbmi, plane, subsampling_x, subsampling_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+    const BLOCK_SIZE bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base
+              : mbmi->sb_type[xd->tree_type == CHROMA_PART];
+    assert(plane_bsize ==
+           get_plane_block_size(bsize_base, subsampling_x, subsampling_y));
+    (void)bsize_base;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#else
+    const BLOCK_SIZE bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base : mbmi->sb_type;
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize_base, subsampling_x, subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     assert(plane_bsize < BLOCK_SIZES_ALL);
     const int mi_width = mi_size_wide[plane_bsize];
     const int mi_height = mi_size_high[plane_bsize];
@@ -954,14 +978,13 @@
 #else
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
 #endif
-    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size);
   }
 }
 
 void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
                                   TRELLIS_OPT_TYPE enable_optimize_b) {
-  assert(bsize < BLOCK_SIZES_ALL);
   const MACROBLOCKD *const xd = &x->e_mbd;
   if (plane && !xd->is_chroma_ref) return;
 
@@ -971,21 +994,25 @@
   ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
   ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
 #if CONFIG_SDP
-  struct encode_b_args arg = {
-    cpi,
-    x,
-    NULL,
-    &(xd->mi[0]->skip_txfm[xd->tree_type == CHROMA_PART]),
+  int8_t *skip_txfm = &(xd->mi[0]->skip_txfm[xd->tree_type == CHROMA_PART]);
 #else
-  struct encode_b_args arg = {
-    cpi, x,  NULL,    &(xd->mi[0]->skip_txfm),
-#endif
-    ta,
-    tl,
-    dry_run,
-    enable_optimize_b
-  };
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  int8_t *skip_txfm = &(xd->mi[0]->skip_txfm);
+#endif  // CONFIG_SDP
+
+  struct encode_b_args arg = { cpi, x,  NULL,    skip_txfm,
+                               ta,  tl, dry_run, enable_optimize_b };
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+  const BLOCK_SIZE plane_bsize =
+      get_mb_plane_block_size(xd, xd->mi[0], plane, ss_x, ss_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  assert(plane_bsize == get_plane_block_size(bsize, ss_x, ss_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+  (void)bsize;
+#else
+  const BLOCK_SIZE bsize_base =
+      plane ? xd->mi[0]->chroma_ref_info.bsize_base : bsize;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize_base, ss_x, ss_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
   if (enable_optimize_b) {
     av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
   }
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 91b47cc..88274b7 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -63,8 +63,7 @@
   TRELLIS_OPT_TYPE enable_optimize_b;
 };
 
-void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                   RUN_TYPE dry_run);
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, RUN_TYPE dry_run);
 
 void av1_foreach_transformed_block_in_plane(
     const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 43ea020..24f4cde 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1447,6 +1447,9 @@
     thread_data->td->firstpass_ctx = NULL;
     av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
     av1_free_sms_tree(thread_data->td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_free_sms_bufs(thread_data->td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     aom_free(thread_data->td);
   }
 }
@@ -1965,6 +1968,9 @@
     av1_free_context_buffers(cm);
     av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
     av1_free_sms_tree(&cpi->td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_free_sms_bufs(&cpi->td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
     cpi->td.firstpass_ctx = NULL;
     alloc_compressor_data(cpi);
@@ -2068,8 +2074,8 @@
   const int use_ccso = !cm->features.coded_lossless && !cm->tiles.large_scale &&
                        cm->seq_params.enable_ccso;
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, &cm->cur_frame->buf,
-                       0, 0, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, 0, 0, 0, num_planes,
+                       NULL);
   const int ccso_stride = xd->plane[0].dst.width;
   const int ccso_stride_ext = xd->plane[0].dst.width + (CCSO_PADDING_SIZE << 1);
   for (int pli = 0; pli < 2; pli++) {
@@ -2135,8 +2141,8 @@
 
 #if CONFIG_CCSO
   if (use_ccso) {
-    av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, &cm->cur_frame->buf,
-                         0, 0, 0, num_planes);
+    av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, 0, 0, 0, num_planes,
+                         NULL);
     // Reading original and reconstructed chroma samples as input
     for (int pli = 1; pli < 3; pli++) {
       const int pic_height = xd->plane[pli].dst.height;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 71a1158..3cc21be 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1157,6 +1157,9 @@
                                      [PALETTE_COLOR_INDEX_CONTEXTS]
                                      [PALETTE_COLORS];
   unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  unsigned int partition_rec[PARTITION_CONTEXTS_REC][PARTITION_TYPES_REC];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
   unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
                         [EOB_COEF_CONTEXTS][2];
@@ -1456,6 +1459,9 @@
   PC_TREE_SHARED_BUFFERS shared_coeff_buf;
   SIMPLE_MOTION_DATA_TREE *sms_tree;
   SIMPLE_MOTION_DATA_TREE *sms_root;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  struct SimpleMotionDataBufs *sms_bufs;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   InterModesInfo *inter_modes_info;
   uint32_t *hash_value_buffer[2][2];
   OBMCBuffer obmc_buffer;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index b4291d2..e458443 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -74,8 +74,14 @@
 
   av1_setup_shared_coeff_buffer(&cpi->common, &cpi->td.shared_coeff_buf);
   av1_setup_sms_tree(cpi, &cpi->td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  av1_setup_sms_bufs(&cpi->common, &cpi->td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   cpi->td.firstpass_ctx =
-      av1_alloc_pmc(cm, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+      av1_alloc_pmc(cm, 0, 0, BLOCK_16X16, NULL, PARTITION_NONE, 0,
+                    cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+                    &cpi->td.shared_coeff_buf);
 }
 
 static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
@@ -295,6 +301,9 @@
 
   av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
   av1_free_sms_tree(&cpi->td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  av1_free_sms_bufs(&cpi->td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   aom_free(cpi->td.mb.palette_buffer);
   release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 3bffabd..e506d2d 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -329,14 +329,8 @@
                           int block, TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-#if CONFIG_SDP
   const int txb_offset =
-      x->mbmi_ext_frame->cb_offset[plane > 0 && xd->tree_type == CHROMA_PART] /
-      (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-#else
-  const int txb_offset =
-      x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-#endif
+      x->mbmi_ext_frame->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
   const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
   const uint16_t eob = eob_txb[block];
   const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
@@ -430,14 +424,8 @@
   const int height = get_txb_high(tx_size);
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
-#if CONFIG_SDP
   const tran_low_t *tcoeff_txb =
-      cb_coef_buff->tcoeff[plane] +
-      x->mbmi_ext_frame->cb_offset[plane > 0 && xd->tree_type == CHROMA_PART];
-#else
-  const tran_low_t *tcoeff_txb =
-      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
-#endif
+      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane];
   const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
   av1_txb_init_levels(tcoeff, width, height, levels);
   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
@@ -1594,15 +1582,8 @@
     }
 
     CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-#if CONFIG_SDP
     const int txb_offset =
-        x->mbmi_ext_frame
-            ->cb_offset[(plane > 0 && xd->tree_type == CHROMA_PART) ? 1 : 0] /
-        (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-#else
-    const int txb_offset =
-        x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-#endif
+        x->mbmi_ext_frame->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
     uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
     uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
     entropy_ctx[block] = txb_ctx.txb_skip_ctx;
@@ -1615,14 +1596,8 @@
     }
     const int segment_id = mbmi->segment_id;
     const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-#if CONFIG_SDP
     tran_low_t *tcoeff_txb =
-        cb_coef_buff->tcoeff[plane] +
-        x->mbmi_ext_frame->cb_offset[plane > 0 && xd->tree_type == CHROMA_PART];
-#else
-    tran_low_t *tcoeff_txb =
-        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
-#endif
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane];
     tcoeff = tcoeff_txb + block_offset;
     memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
 
@@ -1667,6 +1642,7 @@
       if (allow_update_cdf) {
         if (c == eob - 1) {
           assert(coeff_ctx < 4);
+          assert(level > 0);
           update_cdf(
               ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
               AOMMIN(level, 3) - 1, 3);
@@ -1677,6 +1653,7 @@
       }
       if (c == eob - 1) {
         assert(coeff_ctx < 4);
+        assert(level > 0);
 #if CONFIG_ENTROPY_STATS
         ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
                                           [coeff_ctx][AOMMIN(level, 3) - 1];
@@ -1743,7 +1720,10 @@
   if (mbmi->skip_txfm[xd->tree_type == CHROMA_PART]) {
 #else
   if (mbmi->skip_txfm) {
-#endif
+#endif  // CONFIG_SDP
+#if CONFIG_SDP
+    assert(bsize == mbmi->sb_type[av1_get_sdp_idx(xd->tree_type)]);
+#endif  // CONFIG_SDP
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
@@ -1758,7 +1738,15 @@
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const int ss_x = pd->subsampling_x;
     const int ss_y = pd->subsampling_y;
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE plane_bsize =
+        get_mb_plane_block_size(xd, mbmi, plane, ss_x, ss_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+    assert(plane_bsize == get_plane_block_size(bsize, ss_x, ss_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#else
     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     av1_foreach_transformed_block_in_plane(
         xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg);
   }
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 0f754b7..0e48e7e 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -550,6 +550,9 @@
     if (i > 0) {
       // Set up sms_tree.
       av1_setup_sms_tree(cpi, thread_data->td);
+#if CONFIG_EXT_RECUR_PARTITIONS
+      av1_setup_sms_bufs(cm, thread_data->td);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
       alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
 
@@ -666,8 +669,10 @@
 
     if (i > 0) {
       // Set up firstpass PICK_MODE_CONTEXT.
-      thread_data->td->firstpass_ctx =
-          av1_alloc_pmc(cm, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+      thread_data->td->firstpass_ctx = av1_alloc_pmc(
+          cm, 0, 0, BLOCK_16X16, NULL, PARTITION_NONE, 0,
+          cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+          &thread_data->td->shared_coeff_buf);
 
       // Create threads
       if (!winterface->reset(worker))
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 62e520b..f91c0c4 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -365,8 +365,8 @@
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
   set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize],
                  mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows,
-                 mi_params->mi_cols);
-  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+                 mi_params->mi_cols, NULL);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes, NULL);
   xd->mi[0]->segment_id = 0;
   xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
   xd->mi[0]->mode = DC_PRED;
@@ -952,6 +952,8 @@
   AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
   AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
 
+  xd->tile = *tile;
+
   const YV12_BUFFER_CONFIG *const last_frame =
       get_ref_frame_yv12_buf(cm, LAST_FRAME);
   const YV12_BUFFER_CONFIG *golden_frame =
@@ -1015,7 +1017,7 @@
                         cpi->oxcf.border_in_pixels);
 
   av1_setup_src_planes(x, cpi->source, mb_row << FP_MIB_SIZE_LOG2,
-                       tile->mi_col_start, num_planes, fp_block_size);
+                       tile->mi_col_start, num_planes, NULL);
 
   // Fix - zero the 16x16 block first. This ensures correct this_intra_error for
   // block sizes smaller than 16x16.
@@ -1133,12 +1135,11 @@
   av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);
 
-  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
-  av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
-                       num_planes);
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, NULL);
+  av1_setup_dst_planes(xd->plane, this_frame, 0, 0, 0, num_planes, NULL);
 
   if (!frame_is_intra_only(cm)) {
-    av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
+    av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes, NULL);
   }
 
   set_mi_offsets(mi_params, xd, 0, 0);
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index b40def7..9a48c09 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -238,7 +238,7 @@
     int *best_angle_delta, int64_t *best_rd) {
   MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
 #if CONFIG_SDP
-  assert(!is_inter_block(mbmi, x->e_mbd.tree_type));
+  assert(!is_inter_block(mbmi, cpi->td.mb.e_mbd.tree_type));
 #else
   assert(!is_inter_block(mbmi));
 #endif
@@ -246,8 +246,7 @@
   int64_t this_rd;
   RD_STATS tokenonly_rd_stats;
 
-  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
-    return INT64_MAX;
+  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, best_rd_in)) return INT64_MAX;
   this_rate = tokenonly_rd_stats.rate +
               intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
@@ -338,19 +337,16 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U];
   const ModeCosts *mode_costs = &x->mode_costs;
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
 #if CONFIG_SDP
   assert(xd->tree_type != LUMA_PART);
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(
-      mbmi->sb_type[PLANE_TYPE_UV], pd->subsampling_x, pd->subsampling_y);
-#else
+#endif  // CONFIG_SDP
+  const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+      xd, mbmi, PLANE_TYPE_UV, pd->subsampling_x, pd->subsampling_y);
+#else   // !CONFIG_SDP && !CONFIG_EXT_RECUR_PARTITIONS
   const BLOCK_SIZE plane_bsize =
-#if CONFIG_SDP
-      get_plane_block_size(mbmi->sb_type[xd->tree_type == CHROMA_PART],
-                           pd->subsampling_x, pd->subsampling_y);
-#else
       get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
-#endif
-#endif
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
 
   assert(is_cfl_allowed(xd) && cpi->oxcf.intra_mode_cfg.enable_cfl_intra);
   assert(plane_bsize < BLOCK_SIZES_ALL);
@@ -499,11 +495,6 @@
   if (xd->tree_type == SHARED_PART) {
 #endif
     if (xd->cfl.store_y) {
-      // Restore reconstructed luma values.
-      // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
-      // this function everytime we search through uv modes. There is some
-      // potential speed up here if we cache the result to avoid redundant
-      // computation.
 #if CONFIG_SDP
       av1_encode_intra_block_plane(cpi, x, mbmi->sb_type[PLANE_TYPE_Y],
                                    AOM_PLANE_Y, DRY_RUN_NORMAL,
@@ -512,7 +503,7 @@
     av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
                                  DRY_RUN_NORMAL,
                                  cpi->optimize_seg_arr[mbmi->segment_id]);
-#endif
+#endif  // CONFIG_SDP
       xd->cfl.store_y = 0;
     }
 #if CONFIG_SDP
@@ -562,7 +553,7 @@
         continue;
     } else {
       // Predict directly if we don't need to search for angle delta.
-      if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+      if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, best_rd)) {
         continue;
       }
     }
@@ -918,6 +909,17 @@
       best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
       try_filter_intra = (best_rd_so_far / 2) <= best_rd;
     }
+#if CONFIG_EXT_RECUR_PARTITIONS
+    const MB_MODE_INFO *cached_mode = x->inter_mode_cache;
+    const FILTER_INTRA_MODE_INFO *cached_fi_mode =
+        cached_mode ? &cached_mode->filter_intra_mode_info : NULL;
+    if (should_reuse_mode(x, REUSE_INTRA_MODE_IN_INTERFRAME_FLAG) &&
+        !frame_is_intra_only(cm) && cached_fi_mode &&
+        !cached_fi_mode->use_filter_intra) {
+      // assert(cached_mode->mode == DC_PRED);
+      try_filter_intra = 0;
+    }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
     if (try_filter_intra) {
       handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 8601b55..ad0f4d7 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1912,7 +1912,7 @@
     // motion search code to be used without additional modifications.
     for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
     av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
-                         MAX_MB_PLANE);
+                         MAX_MB_PLANE, NULL);
   }
 
   if (xd->bd != 8) {
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 2519cc8..7d892f1 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -58,18 +58,6 @@
 struct AV1_COMP;
 struct SPEED_FEATURES;
 
-// =============================================================================
-//  Cost functions
-// =============================================================================
-
-enum {
-  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
-  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
-  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
-  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
-  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
-} UENUM1BYTE(MV_COST_TYPE);
-
 typedef struct {
   // The reference mv used to compute the mv cost
   const MV *ref_mv;
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index ca271ad..e33b577 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -72,7 +72,7 @@
       backup_yv12[i] = xd->plane[i].pre[ref_idx];
     }
     av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
+                         num_planes, &mbmi->chroma_ref_info);
   }
 
   // Work out the size of the first step in the mv step search.
@@ -385,7 +385,7 @@
         for (i = 0; i < num_planes; i++)
           backup_yv12[ref][i] = xd->plane[i].pre[ref];
         av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
-                             NULL, num_planes);
+                             NULL, num_planes, &mbmi->chroma_ref_info);
       }
     }
 
@@ -532,7 +532,7 @@
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
     av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
+                         num_planes, &mbmi->chroma_ref_info);
   }
 
   int bestsme = INT_MAX;
@@ -764,12 +764,12 @@
   int_mv best_mv;
 
   av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
-                       get_ref_scale_factors(cm, ref), num_planes);
+                       get_ref_scale_factors(cm, ref), num_planes, NULL);
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   if (scaled_ref_frame) {
     backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
     av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
+                         num_planes, NULL);
   }
 
   // Allow more mesh searches for screen content type on the ARF.
@@ -843,3 +843,120 @@
 
   return best_mv;
 }
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize, const CHROMA_REF_INFO *chr_ref_info);
+int_mv av1_simple_motion_search_ext(AV1_COMP *const cpi,
+                                    const TileInfo *const tile, MACROBLOCK *x,
+                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                    int ref, FULLPEL_MV start_mv,
+                                    int num_planes, int use_subpixel,
+                                    SimpleMotionData *sms_data) {
+  assert(num_planes == 1 &&
+         "Currently simple_motion_search only supports luma plane");
+  assert(!frame_is_intra_only(&cpi->common) &&
+         "Simple motion search only enabled for non-key frames");
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  // TODO(debargha,chiyotsai): Can we use set_offsets_for_motion_search()?
+  av1_set_offsets(cpi, tile, x, mi_row, mi_col, bsize, NULL);
+  // set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+#if CONFIG_SDP
+  mbmi->sb_type[0] = mbmi->sb_type[1] = bsize;
+#else
+  mbmi->sb_type = bsize;
+#endif  // CONFIG_SDP
+  mbmi->ref_frame[0] = ref;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_REMOVE_DUAL_FILTER
+  mbmi->interp_fltr = EIGHTTAP_REGULAR;
+#else
+  mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+#endif  // CONFIG_REMOVE_DUAL_FILTER
+
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  struct buf_2d backup_yv12;
+  // ref_mv is used to calculate the cost of the motion vector
+  const MV ref_mv = kZeroMv;
+  const int step_param =
+      AOMMIN(cpi->mv_search_params.mv_step_param +
+                 cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+             MAX_MVSEARCH_STEPS - 2);
+  const search_site_config *src_search_sites =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+  int cost_list[5];
+  const int ref_idx = 0;
+  int var;
+  int_mv best_mv;
+
+  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, ref), num_planes, NULL);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  if (scaled_ref_frame) {
+    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes, NULL);
+  }
+
+  // Allow more mesh searches for screen content type on the ARF.
+  const int fine_search_interval = use_fine_search_interval(cpi);
+  sms_data->sadpb = x->mv_costs.sadperbit;
+  sms_data->errorperbit = x->mv_costs.errorperbit;
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+                                     src_search_sites, fine_search_interval);
+
+  var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                              cond_cost_list(cpi, cost_list),
+                              &best_mv.as_fullmv, NULL);
+
+  sms_data->fullmv = best_mv.as_mv;
+  const int use_subpel_search =
+      var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+      use_subpixel;
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+  if (use_subpel_search) {
+    int not_used = 0;
+
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+                                      cost_list);
+    // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
+    ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop;
+
+    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+
+    cpi->mv_search_params.find_fractional_mv_step(
+        xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, &not_used,
+        &x->pred_sse[ref], NULL);
+  } else {
+    // Manually convert from units of pixel to 1/8-pixels if we are not doing
+    // subpel search
+    convert_fullmv_to_mv(&best_mv);
+  }
+  sms_data->submv = best_mv.as_mv;
+  mbmi->mv[0] = best_mv;
+
+  // Get a copy of the prediction output
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+
+  aom_clear_system_state();
+
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+
+  return best_mv;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index e631c4e..c671211 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -70,7 +70,14 @@
                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
                                  const FULLPEL_MV start_mv, int use_subpixel,
                                  unsigned int *sse, unsigned int *var);
-
+#if CONFIG_EXT_RECUR_PARTITIONS
+int_mv av1_simple_motion_search_ext(AV1_COMP *const cpi,
+                                    const TileInfo *const tile, MACROBLOCK *x,
+                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                    int ref, FULLPEL_MV start_mv,
+                                    int num_planes, int use_subpixel,
+                                    SimpleMotionData *sms_data);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c
index a923533..50d7c40 100644
--- a/av1/encoder/mv_prec.c
+++ b/av1/encoder/mv_prec.c
@@ -276,74 +276,126 @@
 }
 
 // Split block
+#if CONFIG_EXT_RECUR_PARTITIONS
+static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
+                                           const AV1_COMP *cpi, int mi_row,
+                                           int mi_col, BLOCK_SIZE bsize,
+                                           PARTITION_TREE *ptree) {
+#else
 static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
                                            const AV1_COMP *cpi, int mi_row,
                                            int mi_col, BLOCK_SIZE bsize) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   assert(bsize < BLOCK_SIZES_ALL);
   const AV1_COMMON *cm = &cpi->common;
 
   if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
     return;
-#if CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const PARTITION_TYPE partition = ptree->partition;
+#elif CONFIG_SDP
   const PARTITION_TYPE partition =
       get_partition(cm, SHARED_PART, mi_row, mi_col, bsize);
 #else
-  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
 #endif
+
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
 
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int qbs = mi_size_wide[bsize] / 4;
+  const int hbs_w = mi_size_wide[bsize] / 2;
+  const int hbs_h = mi_size_high[bsize] / 2;
+  const int qbs_w = mi_size_wide[bsize] / 4;
+  const int qbs_h = mi_size_high[bsize] / 4;
   switch (partition) {
     case PARTITION_NONE:
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize,
+                          ptree->sub_tree[0]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs_h, mi_col, subsize,
+                          ptree->sub_tree[1]);
+#else
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
     case PARTITION_VERT:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize,
+                          ptree->sub_tree[0]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs_w, subsize,
+                          ptree->sub_tree[1]);
+#else
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs_w);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
+#if !CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_SPLIT:
       collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize);
-      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize);
-      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize);
-      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs_w, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs_h, mi_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs_h, mi_col + hbs_w,
+                          subsize);
       break;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3: {
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize,
+                          ptree->sub_tree[0]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + qbs_h, mi_col,
+                          get_partition_subsize(bsize, PARTITION_HORZ),
+                          ptree->sub_tree[1]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + 3 * qbs_h, mi_col, subsize,
+                          ptree->sub_tree[2]);
+      break;
+    }
+    case PARTITION_VERT_3: {
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize,
+                          ptree->sub_tree[0]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + qbs_w,
+                          get_partition_subsize(bsize, PARTITION_VERT),
+                          ptree->sub_tree[1]);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + 3 * qbs_w, subsize,
+                          ptree->sub_tree[2]);
+      break;
+    }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_HORZ_A:
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs_w);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col);
       break;
     case PARTITION_HORZ_B:
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col + hbs_w);
       break;
     case PARTITION_VERT_A:
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs_w);
       break;
     case PARTITION_VERT_B:
       collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
-      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
-      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs_w);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs_h, mi_col + hbs_w);
       break;
     case PARTITION_HORZ_4:
       for (int i = 0; i < 4; ++i) {
-        const int this_mi_row = mi_row + i * qbs;
+        const int this_mi_row = mi_row + i * qbs_h;
         collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col);
       }
       break;
     case PARTITION_VERT_4:
       for (int i = 0; i < 4; ++i) {
-        const int this_mi_col = mi_col + i * qbs;
+        const int this_mi_col = mi_col + i * qbs_w;
         collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col);
       }
       break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     default: assert(0);
   }
 }
@@ -360,7 +412,18 @@
   BLOCK_SIZE sb_size = cm->seq_params.sb_size;
   for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
     for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+      const SB_INFO *sb_info = av1_get_sb_info(cm, mi_row, mi_col);
+#if CONFIG_SDP
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size,
+                          sb_info->ptree_root[0]);
+#else
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size,
+                          sb_info->ptree_root);
+#endif  // CONFIG_SDP
+#else
       collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     }
   }
 }
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index fb483d0..e6af1dc 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -792,7 +792,7 @@
         }
       }
 
-      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, *best_rd);
       if (tokenonly_rd_stats.rate == INT_MAX) continue;
       this_rate = tokenonly_rd_stats.rate +
                   intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 947dca1..7ca6495 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -12,11 +12,13 @@
 #include "aom_ports/system_state.h"
 
 #include "av1/common/blockd.h"
+#include "av1/common/common_data.h"
 #include "av1/common/enums.h"
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_variance.h"
+#include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodeframe.h"
@@ -24,6 +26,7 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tokenize.h"
 
@@ -383,7 +386,7 @@
     for (int plane = plane_start; plane < plane_end; ++plane) {
 #else
     for (int plane = 0; plane < num_planes; ++plane) {
-#endif
+#endif  // CONFIG_SDP
       av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
                                    cpi->optimize_seg_arr[mbmi->segment_id]);
     }
@@ -435,7 +438,8 @@
       assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
 #endif
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                           xd->block_ref_scale_factors[ref], num_planes);
+                           xd->block_ref_scale_factors[ref], num_planes,
+                           &mbmi->chroma_ref_info);
     }
     int start_plane = 0;
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
@@ -452,9 +456,7 @@
         int pixel_c, pixel_r;
         mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
                         pd->subsampling_x, pd->subsampling_y);
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
+        if (plane && !xd->is_chroma_ref) continue;
         mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
                                   cm->current_frame.order_hint, plane, pixel_c,
                                   pixel_r, pd->width, pd->height,
@@ -465,7 +467,7 @@
     (void)num_planes;
 #endif
 
-    av1_encode_sb(cpi, x, bsize, dry_run);
+    av1_encode_sb(cpi, x, dry_run);
     av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
                           tile_data->allow_update_cdf);
   }
@@ -616,11 +618,13 @@
     }
   }
 #endif
+
+  av1_mark_block_as_coded(xd, bsize, cm->seq_params.sb_size);
 }
 
-static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               int mi_row, int mi_col, BLOCK_SIZE bsize,
-                               AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                        int mi_row, int mi_col, BLOCK_SIZE bsize,
+                        AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
   x->rdmult = cpi->rd.RDMULT;
 #if CONFIG_SDP
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -667,7 +671,8 @@
 void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
                                         const TileInfo *const tile,
                                         MACROBLOCK *const x, int mi_row,
-                                        int mi_col, BLOCK_SIZE bsize) {
+                                        int mi_col, BLOCK_SIZE bsize,
+                                        const CHROMA_REF_INFO *chr_ref_info) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -678,29 +683,32 @@
   set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
                         mi_row, mi_col);
 
-  set_entropy_context(xd, mi_row, mi_col, num_planes);
+  set_entropy_context(xd, mi_row, mi_col, num_planes, chr_ref_info);
   xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   // Set up destination pointers.
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, chr_ref_info);
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
   av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
                     mi_width, cpi->oxcf.border_in_pixels);
 
-  set_plane_n4(xd, mi_width, mi_height, num_planes);
+  set_plane_n4(xd, mi_width, mi_height, num_planes, chr_ref_info);
 
   // Set up distance of MB to edge of frame in 1/8th pel units.
+#if !CONFIG_EXT_RECUR_PARTITIONS
   assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols, chr_ref_info);
 
   // Set up source buffers.
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes,
+                       chr_ref_info);
 
   // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
   xd->tile = *tile;
@@ -708,13 +716,14 @@
 
 void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
                      MACROBLOCK *const x, int mi_row, int mi_col,
-                     BLOCK_SIZE bsize) {
+                     BLOCK_SIZE bsize, const CHROMA_REF_INFO *chr_ref_info) {
   const AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
 
-  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize,
+                                     chr_ref_info);
 
   // Setup segment ID.
   mbmi = xd->mi[0];
@@ -780,9 +789,14 @@
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   int plane_type = (xd->tree_type == CHROMA_PART);
+  assert(bsize < BLOCK_SIZES_ALL);
+  assert(IMPLIES(xd->tree_type == CHROMA_PART,
+                 AOMMIN(block_size_wide[bsize], block_size_high[bsize]) > 4));
 #endif
+  assert(is_bsize_geq(bsize, cpi->common.mi_params.mi_alloc_bsize));
 
-  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize,
+                  &ctx->chroma_ref_info);
 
   if (ctx->rd_mode_is_ready) {
 #if CONFIG_SDP
@@ -824,6 +838,7 @@
   mbmi->sb_type = bsize;
 #endif
   mbmi->partition = partition;
+  mbmi->chroma_ref_info = ctx->chroma_ref_info;
 
 #if CONFIG_RD_DEBUG
   mbmi->mi_row = mi_row;
@@ -1440,13 +1455,15 @@
 static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                      ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
                      RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
-                     int *rate) {
+                     PARTITION_TYPE partition,
+                     const PICK_MODE_CONTEXT *const ctx, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
   TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize,
+                                     &ctx->chroma_ref_info);
   const int origin_mult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
   MB_MODE_INFO *mbmi = xd->mi[0];
@@ -1454,36 +1471,47 @@
   av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
 
 #if CONFIG_SDP
-  int plane_type = (xd->tree_type == CHROMA_PART);
-#endif
+  const int num_planes = av1_num_planes(cm);
+  const int plane_start = (xd->tree_type == CHROMA_PART);
+  const int plane_end = (xd->tree_type == LUMA_PART) ? 1 : num_planes;
+#endif  // CONFIG_SDP
 
   if (!dry_run) {
 #if CONFIG_SDP
-    x->mbmi_ext_frame->cb_offset[plane_type] = x->cb_offset[plane_type];
-    assert(x->cb_offset[plane_type] <
-           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+    for (int plane = plane_start; plane < plane_end; plane++) {
+      x->mbmi_ext_frame->cb_offset[plane] = x->cb_offset[plane];
+      assert(x->cb_offset[plane] <
+             (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+    }
 #else
-    x->mbmi_ext_frame->cb_offset = x->cb_offset;
-    assert(x->cb_offset <
+    memcpy(x->mbmi_ext_frame->cb_offset, x->cb_offset, sizeof(x->cb_offset));
+    assert(x->cb_offset[0] <
            (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
-#endif
+#endif  // CONFIG_SDP
   }
 
   encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
 
   if (!dry_run) {
-    const AV1_COMMON *const cm = &cpi->common;
 #if CONFIG_SDP
-    x->cb_offset[plane_type] += block_size_wide[bsize] * block_size_high[bsize];
+    for (int plane = plane_start; plane < plane_end; ++plane) {
 #else
-    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-#endif
+    for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#endif  // CONFIG_SDP
+      if (plane == 0) {
+        x->cb_offset[plane] += block_size_wide[bsize] * block_size_high[bsize];
+      } else if (xd->is_chroma_ref) {
+        const BLOCK_SIZE bsize_base = mbmi->chroma_ref_info.bsize_base;
+        x->cb_offset[plane] +=
+            block_size_wide[bsize_base] * block_size_high[bsize_base];
+      }
+    }
 #if CONFIG_SDP
     if (bsize == cpi->common.seq_params.sb_size &&
         mbmi->skip_txfm[xd->tree_type == CHROMA_PART] == 1 &&
 #else
     if (bsize == cpi->common.seq_params.sb_size && mbmi->skip_txfm == 1 &&
-#endif
+#endif  // CONFIG_SDP
         cm->delta_q_info.delta_lf_present_flag) {
       const int frame_lf_count =
           av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
@@ -1605,6 +1633,44 @@
   x->rdmult = origin_mult;
 }
 
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree.
+ *
+ * \param[in]    cpi        Top-level encoder structure
+ * \param[in]    td         Pointer to thread data
+ * \param[in]    tile_data  Pointer to struct holding adaptive
+ *                          data/contexts/models for the tile during encoding
+ * \param[in]    tp         Pointer to the starting token
+ * \param[in]    mi_row     Row coordinate of the block in a step size of
+ *                          MI_SIZE
+ * \param[in]    mi_col     Column coordinate of the block in a step size of
+ *                          MI_SIZE
+ * \param[in]    dry_run    A code indicating whether it is part of the final
+ *                          pass for reconstructing the superblock
+ * \param[in]    bsize      Current block size
+ * \param[in]    pc_tree    Pointer to the PC_TREE node storing the picked
+ *                          partitions and mode info for the current block
+ * \param[in]    ptree      Pointer to the PARTITION_TREE node holding the
+ *                          partition info for the current node and all of its
+ *                          descendants.
+ * \param[in]    ptree_luma Pointer to the luma partition tree so that the
+ *                          encoder to estimate the
+ *                          partition type for chroma.
+ * \param[in]     rate      Pointer to the total rate for the current block
+ *
+ * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+                      TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                      int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                      const PC_TREE *pc_tree, PARTITION_TREE *ptree,
+                      PARTITION_TREE *ptree_luma, int *rate) {
+#else
 /*!\brief Reconstructs a partition (may contain multiple coding blocks)
  *
  * \ingroup partition_search
@@ -1624,6 +1690,9 @@
  * \param[in]    bsize     Current block size
  * \param[in]    pc_tree   Pointer to the PC_TREE node storing the picked
  *                         partitions and mode info for the current block
+ * \param[in]    ptree     Pointer to the PARTITION_TREE node holding the
+ *                         partition info for the current node and all of its
+ *                         descendants.
  * \param[in]    rate      Pointer to the total rate for the current block
  *
  * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
@@ -1632,133 +1701,325 @@
 static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
                       TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
                       int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                      PC_TREE *pc_tree, int *rate) {
+                      const PC_TREE *pc_tree, PARTITION_TREE *ptree,
+                      int *rate) {
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
   assert(bsize < BLOCK_SIZES_ALL);
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   assert(bsize < BLOCK_SIZES_ALL);
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int is_partition_root = bsize >= BLOCK_8X8;
+  const int hbs_w = mi_size_wide[bsize] / 2;
+  const int hbs_h = mi_size_high[bsize] / 2;
+  const int qbs_w = mi_size_wide[bsize] / 4;
+  const int qbs_h = mi_size_high[bsize] / 4;
+  const int is_partition_root = is_partition_point(bsize);
   const int ctx = is_partition_root
                       ? partition_plane_context(xd, mi_row, mi_col, bsize)
                       : -1;
   const PARTITION_TYPE partition = pc_tree->partitioning;
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  int quarter_step = mi_size_wide[bsize] / 4;
-  int i;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
   if (subsize == BLOCK_INVALID) return;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  assert(partition != PARTITION_SPLIT);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   if (!dry_run && ctx >= 0) {
-    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
-    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+    const int has_rows = (mi_row + hbs_h) < mi_params->mi_rows;
+    const int has_cols = (mi_col + hbs_w) < mi_params->mi_cols;
 
 #if CONFIG_SDP
     const int plane_index = xd->tree_type == CHROMA_PART;
-#endif
-
-    if (has_rows && has_cols) {
+#endif  // CONFIG_SDP
+#if CONFIG_EXT_RECUR_PARTITIONS
+    if (is_square_block(bsize)) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      if (has_rows && has_cols) {
 #if CONFIG_ENTROPY_STATS
-      td->counts->partition[ctx][partition]++;
+        td->counts->partition[ctx][partition]++;
 #endif
-
-      if (tile_data->allow_update_cdf) {
-        FRAME_CONTEXT *fc = xd->tile_ctx;
+        if (tile_data->allow_update_cdf) {
+          FRAME_CONTEXT *fc = xd->tile_ctx;
 #if CONFIG_SDP
-        int luma_split_flag = 0;
-        int parent_block_width = block_size_wide[bsize];
-        if (xd->tree_type == CHROMA_PART &&
-            parent_block_width >= SHARED_PART_SIZE) {
-          luma_split_flag =
-              get_luma_split_flag(bsize, mi_params, mi_row, mi_col);
-        }
-        if (luma_split_flag <= 3) {
-          update_cdf(fc->partition_cdf[plane_index][ctx], partition,
-                     partition_cdf_length(bsize));
-        } else {
-          // if luma blocks uses smaller blocks, then chroma will also split
-          assert(partition == PARTITION_SPLIT);
-        }
+          int parent_block_width = block_size_wide[bsize];
+#if CONFIG_EXT_RECUR_PARTITIONS
+          const int min_bsize_1d =
+              AOMMIN(block_size_high[bsize], parent_block_width);
+          if (xd->tree_type == CHROMA_PART && ptree_luma &&
+              min_bsize_1d >= SHARED_PART_SIZE) {
+            const int ss_x = xd->plane[1].subsampling_x;
+            const int ss_y = xd->plane[1].subsampling_y;
+            PARTITION_TYPE derived_partition_mode = sdp_chroma_part_from_luma(
+                bsize, ptree_luma->partition, ss_x, ss_y);
+            if (partition != derived_partition_mode)
+              assert(0 && "Chroma partition does not match the derived mode.");
+          } else {
+            update_cdf(fc->partition_cdf[plane_index][ctx], partition,
+                       partition_cdf_length(bsize));
+          }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+          int luma_split_flag = 0;
+          if (xd->tree_type == CHROMA_PART &&
+              parent_block_width >= SHARED_PART_SIZE) {
+            luma_split_flag =
+                get_luma_split_flag(bsize, mi_params, mi_row, mi_col);
+          }
+          if (luma_split_flag <= 3) {
+            update_cdf(fc->partition_cdf[plane_index][ctx], partition,
+                       partition_cdf_length(bsize));
+          } else {
+            // if luma blocks use smaller blocks, then chroma will also split
+            assert(partition == PARTITION_SPLIT);
+          }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 #else
         update_cdf(fc->partition_cdf[ctx], partition,
                    partition_cdf_length(bsize));
-#endif
+#endif  // CONFIG_SDP
+        }
       }
+#if CONFIG_EXT_RECUR_PARTITIONS
+    } else {
+#if CONFIG_SDP
+      int parent_block_width = block_size_wide[bsize];
+      const int min_bsize_1d =
+          AOMMIN(block_size_high[bsize], parent_block_width);
+      if (xd->tree_type == CHROMA_PART && ptree_luma &&
+          min_bsize_1d >= SHARED_PART_SIZE) {
+        const int ss_x = xd->plane[1].subsampling_x;
+        const int ss_y = xd->plane[1].subsampling_y;
+        PARTITION_TYPE derived_partition_mode =
+            sdp_chroma_part_from_luma(bsize, ptree_luma->partition, ss_x, ss_y);
+        assert(partition == derived_partition_mode);
+        (void)derived_partition_mode;
+      } else {
+#endif
+        const PARTITION_TYPE_REC p_rec =
+            get_symbol_from_partition_rec_block(bsize, partition);
+#if CONFIG_ENTROPY_STATS
+        td->counts->partition_rec[ctx][p_rec]++;
+#endif  // CONFIG_ENTROPY_STATS
+
+        if (tile_data->allow_update_cdf) {
+          FRAME_CONTEXT *fc = xd->tile_ctx;
+          update_cdf(fc->partition_rec_cdf[ctx], p_rec,
+                     partition_rec_cdf_length(bsize));
+        }
+#if CONFIG_SDP
+      }
+#endif
     }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   }
 
+  PARTITION_TREE *sub_tree[4] = { NULL, NULL, NULL, NULL };
+  if (!dry_run) {
+    assert(ptree);
+
+    ptree->partition = partition;
+    ptree->bsize = bsize;
+    ptree->mi_row = mi_row;
+    ptree->mi_col = mi_col;
+    PARTITION_TREE *parent = ptree->parent;
+    const int ss_x = xd->plane[1].subsampling_x;
+    const int ss_y = xd->plane[1].subsampling_y;
+    set_chroma_ref_info(
+        mi_row, mi_col, ptree->index, bsize, &ptree->chroma_ref_info,
+        parent ? &parent->chroma_ref_info : NULL,
+        parent ? parent->bsize : BLOCK_INVALID,
+        parent ? parent->partition : PARTITION_NONE, ss_x, ss_y);
+
+    switch (partition) {
+      case PARTITION_SPLIT:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        ptree->sub_tree[2] = av1_alloc_ptree_node(ptree, 2);
+        ptree->sub_tree[3] = av1_alloc_ptree_node(ptree, 3);
+        break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        break;
+      case PARTITION_HORZ_3:
+      case PARTITION_VERT_3:
+        ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+        ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+        ptree->sub_tree[2] = av1_alloc_ptree_node(ptree, 2);
+        break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      default: break;
+    }
+    for (int i = 0; i < 4; ++i) sub_tree[i] = ptree->sub_tree[i];
+  }
+
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  const int min_bsize_1d =
+      AOMMIN(block_size_high[bsize], block_size_wide[bsize]);
+  const int track_ptree_luma = xd->tree_type && ptree_luma &&
+                               ptree_luma->partition == partition &&
+                               min_bsize_1d >= SHARED_PART_SIZE;
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
   switch (partition) {
     case PARTITION_NONE:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, pc_tree->none, rate);
       break;
     case PARTITION_VERT:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->vertical[0], sub_tree[0],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_col + hbs_w < cm->mi_params.mi_cols) {
+        encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs_w, dry_run,
+                  subsize, pc_tree->vertical[1], sub_tree[1],
+#if CONFIG_SDP
+                  track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                  rate);
+      }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, pc_tree->vertical[0], rate);
-      if (mi_col + hbs < mi_params->mi_cols) {
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                 partition, pc_tree->vertical[1], rate);
+      if (mi_col + hbs_w < mi_params->mi_cols) {
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs_w, dry_run,
+                 subsize, partition, pc_tree->vertical[1], rate);
       }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
     case PARTITION_HORZ:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->horizontal[0], sub_tree[0],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_row + hbs_h < cm->mi_params.mi_rows) {
+        encode_sb(cpi, td, tile_data, tp, mi_row + hbs_h, mi_col, dry_run,
+                  subsize, pc_tree->horizontal[1], sub_tree[1],
+#if CONFIG_SDP
+                  track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                  rate);
+      }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, pc_tree->horizontal[0], rate);
-      if (mi_row + hbs < mi_params->mi_rows) {
-        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                 partition, pc_tree->horizontal[1], rate);
+      if (mi_row + hbs_h < mi_params->mi_rows) {
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col, dry_run,
+                 subsize, partition, pc_tree->horizontal[1], rate);
       }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3: {
+      const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_HORZ);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->horizontal3[0], sub_tree[0],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_row + qbs_h >= cm->mi_params.mi_rows) break;
+      encode_sb(cpi, td, tile_data, tp, mi_row + qbs_h, mi_col, dry_run, bsize3,
+                pc_tree->horizontal3[1], sub_tree[1],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_row + 3 * qbs_h >= cm->mi_params.mi_rows) break;
+      encode_sb(cpi, td, tile_data, tp, mi_row + 3 * qbs_h, mi_col, dry_run,
+                subsize, pc_tree->horizontal3[2], sub_tree[2],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[2] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      break;
+    }
+    case PARTITION_VERT_3: {
+      const BLOCK_SIZE bsize3 = get_partition_subsize(bsize, PARTITION_VERT);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->vertical3[0], sub_tree[0],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_col + qbs_w >= cm->mi_params.mi_cols) break;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + qbs_w, dry_run, bsize3,
+                pc_tree->vertical3[1], sub_tree[1],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      if (mi_col + 3 * qbs_w >= cm->mi_params.mi_cols) break;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + 3 * qbs_w, dry_run,
+                subsize, pc_tree->vertical3[2], sub_tree[2],
+#if CONFIG_SDP
+                track_ptree_luma ? ptree_luma->sub_tree[2] : NULL,
+#endif  // CONFIG_SDP
+                rate);
+      break;
+    }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_SPLIT:
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
-                pc_tree->split[0], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                pc_tree->split[1], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                pc_tree->split[2], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
-                subsize, pc_tree->split[3], rate);
+                pc_tree->split[0], sub_tree[0], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs_w, dry_run,
+                subsize, pc_tree->split[1], sub_tree[1], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs_h, mi_col, dry_run,
+                subsize, pc_tree->split[2], sub_tree[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs_h, mi_col + hbs_w, dry_run,
+                subsize, pc_tree->split[3], sub_tree[3], rate);
       break;
-
     case PARTITION_HORZ_A:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
                partition, pc_tree->horizontala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs_w, dry_run, bsize2,
                partition, pc_tree->horizontala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col, dry_run, subsize,
                partition, pc_tree->horizontala[2], rate);
       break;
     case PARTITION_HORZ_B:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, pc_tree->horizontalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col, dry_run, bsize2,
                partition, pc_tree->horizontalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col + hbs_w, dry_run,
                bsize2, partition, pc_tree->horizontalb[2], rate);
       break;
     case PARTITION_VERT_A:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
                partition, pc_tree->verticala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col, dry_run, bsize2,
                partition, pc_tree->verticala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs_w, dry_run, subsize,
                partition, pc_tree->verticala[2], rate);
 
       break;
     case PARTITION_VERT_B:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, pc_tree->verticalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs_w, dry_run, bsize2,
                partition, pc_tree->verticalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs_h, mi_col + hbs_w, dry_run,
                bsize2, partition, pc_tree->verticalb[2], rate);
       break;
     case PARTITION_HORZ_4:
-      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
+      for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_row = mi_row + i * qbs_h;
         if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
         encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
@@ -1766,19 +2027,87 @@
       }
       break;
     case PARTITION_VERT_4:
-      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
+      for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_col = mi_col + i * qbs_w;
         if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
         encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
                  partition, pc_tree->vertical4[i], rate);
       }
       break;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     default: assert(0 && "Invalid partition type."); break;
   }
 
+  if (ptree) ptree->is_settled = 1;
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static void build_one_split_tree(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                 BLOCK_SIZE bsize, BLOCK_SIZE final_bsize,
+                                 PARTITION_TREE *ptree) {
+  assert(block_size_high[bsize] == block_size_wide[bsize]);
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+  if (bsize == BLOCK_4X4 || bsize == final_bsize) {
+    ptree->partition = PARTITION_NONE;
+    return;
+  }
+
+  const int hbs_w = mi_size_wide[bsize] >> 1;
+  const int hbs_h = mi_size_high[bsize] >> 1;
+  const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+
+  ptree->partition = PARTITION_HORZ;
+  ptree->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+  ptree->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+
+  ptree->sub_tree[0]->partition = PARTITION_VERT;
+  ptree->sub_tree[0]->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+  ptree->sub_tree[0]->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+
+  ptree->sub_tree[1]->partition = PARTITION_VERT;
+  ptree->sub_tree[1]->sub_tree[0] = av1_alloc_ptree_node(ptree, 0);
+  ptree->sub_tree[1]->sub_tree[1] = av1_alloc_ptree_node(ptree, 1);
+
+  build_one_split_tree(cm, mi_row, mi_col, subsize, final_bsize,
+                       ptree->sub_tree[0]->sub_tree[0]);
+  build_one_split_tree(cm, mi_row, mi_col + hbs_w, subsize, final_bsize,
+                       ptree->sub_tree[0]->sub_tree[1]);
+  build_one_split_tree(cm, mi_row + hbs_h, mi_col, subsize, final_bsize,
+                       ptree->sub_tree[1]->sub_tree[0]);
+  build_one_split_tree(cm, mi_row + hbs_h, mi_col + hbs_w, subsize, final_bsize,
+                       ptree->sub_tree[1]->sub_tree[1]);
+}
+
+void av1_build_partition_tree_fixed_partitioning(AV1_COMMON *const cm,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize,
+                                                 PARTITION_TREE *ptree) {
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  build_one_split_tree(cm, mi_row, mi_col, sb_size, bsize, ptree);
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+static PARTITION_TYPE get_preset_partition(const AV1_COMMON *cm,
+#if CONFIG_SDP
+                                           int plane_type,
+#endif  // CONFIG_SDP
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize,
+                                           PARTITION_TREE *ptree) {
+  if (ptree) return ptree->partition;
+  if (bsize >= BLOCK_8X8) {
+#if CONFIG_SDP
+    return get_partition(cm, plane_type, mi_row, mi_col, bsize);
+#else
+    return get_partition(cm, mi_row, mi_col, bsize);
+#endif  // CONFIG_SDP
+  }
+  return PARTITION_NONE;
+}
+
 /*!\brief AV1 block partition search (partition estimation and partial search).
 *
 * \ingroup partition_search
@@ -1795,7 +2124,8 @@
 blocks starting from the first pixel of the current
 block
 * \param[in]    tp        Pointer to the starting token
-* \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+* \param[in]    mi_row    Row coordinate of the block in a step size of
+MI_SIZE
 * \param[in]    mi_col    Column coordinate of the block in a step size of
 MI_SIZE
 * \param[in]    bsize     Current block size
@@ -1805,6 +2135,8 @@
 * \param[in]    do_recon  Whether the reconstruction function needs to be run,
 either for finalizing a superblock or providing
 reference for future sub-partitions
+* \param[in]    ptree     Pointer to the PARTITION_TREE node holding the
+pre-calculated partition tree (if any) for the current block
 * \param[in]    pc_tree   Pointer to the PC_TREE node holding the picked
 partitions and mode info for the current block
 *
@@ -1815,42 +2147,50 @@
 void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
                           MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
                           int mi_col, BLOCK_SIZE bsize, int *rate,
-                          int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+                          int64_t *dist, int do_recon, PARTITION_TREE *ptree,
+                          PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
   const ModeCosts *mode_costs = &x->mode_costs;
   const int bs = mi_size_wide[bsize];
   const int hbs = bs / 2;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int hbh = mi_size_high[bsize] / 2;
+  const int hbw = mi_size_wide[bsize] / 2;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   const int pl = (bsize >= BLOCK_8X8)
                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
                      : 0;
-  const PARTITION_TYPE partition =
 #if CONFIG_SDP
-      (bsize >= BLOCK_8X8) ? get_partition(cm, xd->tree_type == CHROMA_PART,
-                                           mi_row, mi_col, bsize)
+  const int plane_type = (xd->tree_type == CHROMA_PART);
+  const PARTITION_TYPE partition =
+      get_preset_partition(cm, plane_type, mi_row, mi_col, bsize, ptree);
 #else
-      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
-#endif
-                           : PARTITION_NONE;
+  const PARTITION_TYPE partition =
+      get_preset_partition(cm, mi_row, mi_col, bsize, ptree);
+#endif  // CONFIG_SDP
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_STATS last_part_rdc, invalid_rdc;
-#if CONFIG_SDP
-  int plane_type = (xd->tree_type == CHROMA_PART);
-#endif
 
   if (pc_tree->none == NULL) {
-    pc_tree->none = av1_alloc_pmc(cm, bsize, &td->shared_coeff_buf);
+    pc_tree->none =
+        av1_alloc_pmc(cm, mi_row, mi_col, bsize, pc_tree, PARTITION_NONE, 0,
+                      ss_x, ss_y, &td->shared_coeff_buf);
   }
   PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
 
   if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   av1_invalid_rd_stats(&last_part_rdc);
   av1_invalid_rd_stats(&invalid_rdc);
@@ -1864,7 +2204,8 @@
   av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
-    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize,
+                    &pc_tree->chroma_ref_info);
     x->mb_energy = av1_log_block_var(cpi, x, bsize);
   }
 
@@ -1872,34 +2213,61 @@
   const int orig_rdmult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE split_subsize =
+      get_partition_subsize(bsize, PARTITION_SPLIT);
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
-    pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
-    pc_tree->split[i]->index = i;
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    pc_tree->split[i] =
+        av1_alloc_pc_tree_node(mi_row + y_idx, mi_col + x_idx, split_subsize,
+                               pc_tree, PARTITION_SPLIT, i, i == 3, ss_x, ss_y);
   }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   switch (partition) {
     case PARTITION_NONE:
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
                     PARTITION_NONE, bsize, ctx_none, invalid_rdc);
       break;
     case PARTITION_HORZ:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      pc_tree->horizontal[0] = av1_alloc_pc_tree_node(
+          mi_row, mi_col, subsize, pc_tree, PARTITION_HORZ, 0, 0, ss_x, ss_y);
+      pc_tree->horizontal[1] =
+          av1_alloc_pc_tree_node(mi_row + hbh, mi_col, subsize, pc_tree,
+                                 PARTITION_HORZ, 1, 1, ss_x, ss_y);
+      av1_rd_use_partition(cpi, td, tile_data, mib, tp, mi_row, mi_col, subsize,
+                           &last_part_rdc.rate, &last_part_rdc.dist, 1,
+                           ptree ? ptree->sub_tree[0] : NULL,
+                           pc_tree->horizontal[0]);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
         pc_tree->horizontal[i] =
-            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+            av1_alloc_pmc(cm, mi_row + hbs * i, mi_col, subsize, pc_tree,
+                          PARTITION_HORZ, i, ss_x, ss_y, &td->shared_coeff_buf);
       }
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
                     PARTITION_HORZ, subsize, pc_tree->horizontal[0],
                     invalid_rdc);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + hbs < mi_params->mi_rows) {
         RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
         av1_init_rd_stats(&tmp_rdc);
+#if CONFIG_EXT_RECUR_PARTITIONS
+        av1_rd_use_partition(
+            cpi, td, tile_data, mib + hbh * mi_params->mi_stride, tp,
+            mi_row + hbh, mi_col, subsize, &tmp_rdc.rate, &tmp_rdc.dist, 0,
+            ptree ? ptree->sub_tree[1] : NULL, pc_tree->horizontal[1]);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+        const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
         av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
                           NULL);
         pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
                       PARTITION_HORZ, subsize, pc_tree->horizontal[1],
                       invalid_rdc);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1910,23 +2278,43 @@
       }
       break;
     case PARTITION_VERT:
+#if CONFIG_EXT_RECUR_PARTITIONS
+      pc_tree->vertical[0] = av1_alloc_pc_tree_node(
+          mi_row, mi_col, subsize, pc_tree, PARTITION_VERT, 0, 0, ss_x, ss_y);
+      pc_tree->vertical[1] =
+          av1_alloc_pc_tree_node(mi_row, mi_col + hbw, subsize, pc_tree,
+                                 PARTITION_VERT, 1, 1, ss_x, ss_y);
+      av1_rd_use_partition(cpi, td, tile_data, mib, tp, mi_row, mi_col, subsize,
+                           &last_part_rdc.rate, &last_part_rdc.dist, 1,
+                           ptree ? ptree->sub_tree[0] : NULL,
+                           pc_tree->vertical[0]);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
         pc_tree->vertical[i] =
-            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+            av1_alloc_pmc(cm, mi_row, mi_col + hbs * i, subsize, pc_tree,
+                          PARTITION_VERT, i, ss_x, ss_y, &td->shared_coeff_buf);
       }
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
                     PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + hbs < mi_params->mi_cols) {
         RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
         av1_init_rd_stats(&tmp_rdc);
+#if CONFIG_EXT_RECUR_PARTITIONS
+        av1_rd_use_partition(
+            cpi, td, tile_data, mib + hbw, tp, mi_row, mi_col + hbw, subsize,
+            &tmp_rdc.rate, &tmp_rdc.dist, 0, ptree ? ptree->sub_tree[1] : NULL,
+            pc_tree->vertical[1]);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+        const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
         av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
                           NULL);
         pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
                       PARTITION_VERT, subsize,
                       pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1936,6 +2324,10 @@
         last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3:
+    case PARTITION_VERT_3:
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_SPLIT:
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
@@ -1950,11 +2342,12 @@
           continue;
 
         av1_init_rd_stats(&tmp_rdc);
-        av1_rd_use_partition(
-            cpi, td, tile_data,
-            mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
-            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
-            &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+        av1_rd_use_partition(cpi, td, tile_data,
+                             mib + jj * hbs * mi_params->mi_stride + ii * hbs,
+                             tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                             &tmp_rdc.rate, &tmp_rdc.dist,
+                             i != (SUB_PARTITIONS_SPLIT - 1), NULL,
+                             pc_tree->split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1969,6 +2362,7 @@
     case PARTITION_HORZ_B:
     case PARTITION_HORZ_4:
     case PARTITION_VERT_4:
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
       assert(0 && "Cannot handle extended partition types");
     default: assert(0); break;
   }
@@ -1992,7 +2386,7 @@
 #endif
   if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
 
-  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  av1_restore_context(cm, x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
@@ -2006,15 +2400,31 @@
       // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
       //           bsize, pc_tree, &rate_coeffs);
 #if CONFIG_SDP
-      x->cb_offset[plane_type] = 0;
-#else
-      x->cb_offset = 0;
-#endif
+      const int plane_start = (xd->tree_type == CHROMA_PART);
+      const int plane_end = (xd->tree_type == LUMA_PART) ? 1 : num_planes;
+      for (int plane = plane_start; plane < plane_end; plane++) {
+        x->cb_offset[plane] = 0;
+      }
+      av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-                pc_tree, NULL);
+                pc_tree, xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                NULL);
+#else
+      memset(x->cb_offset, 0, sizeof(x->cb_offset));
+      av1_reset_ptree_in_sbi(xd->sbi);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, xd->sbi->ptree_root, NULL);
+#endif  // CONFIG_SDP
     } else {
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
+                pc_tree, NULL,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                NULL);
     }
   }
 
@@ -2023,6 +2433,7 @@
   x->rdmult = orig_rdmult;
 }
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
 // Try searching for an encoding for the given subblock. Returns zero if the
 // rdcost is already too high (to tell the caller not to bother searching for
 // encodings of further subblocks).
@@ -2106,29 +2517,71 @@
   pc_tree->partitioning = partition;
   return true;
 }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+static INLINE int check_is_chroma_size_valid(PARTITION_TYPE partition,
+                                             BLOCK_SIZE bsize, int mi_row,
+                                             int mi_col, int ss_x, int ss_y,
+                                             const PC_TREE *pc_tree) {
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  int is_valid = 0;
+  if (subsize < BLOCK_SIZES_ALL) {
+    CHROMA_REF_INFO tmp_chr_ref_info = {
+      1, 0, mi_row, mi_col, subsize, subsize
+    };
+    set_chroma_ref_info(mi_row, mi_col, 0, subsize, &tmp_chr_ref_info,
+                        &pc_tree->chroma_ref_info, bsize, partition, ss_x,
+                        ss_y);
+    is_valid = get_plane_block_size(tmp_chr_ref_info.bsize_base, ss_x, ss_y) !=
+               BLOCK_INVALID;
+  }
+  return is_valid;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 // Initialize state variables of partition search used in
 // av1_rd_pick_partition().
 static void init_partition_search_state_params(
     MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+#if CONFIG_EXT_RECUR_PARTITIONS
+    PC_TREE *pc_tree,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     int mi_row, int mi_col, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const AV1_COMMON *const cm = &cpi->common;
   PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
   const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
 
+  assert(bsize < BLOCK_SIZES_ALL);
+
   // Initialization of block size related parameters.
   blk_params->mi_step = mi_size_wide[bsize] / 2;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  blk_params->mi_step_h = mi_size_high[bsize] / 2;
+  blk_params->mi_step_w = mi_size_wide[bsize] / 2;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   blk_params->mi_row = mi_row;
   blk_params->mi_col = mi_col;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  blk_params->mi_row_edge = mi_row + blk_params->mi_step_h;
+  blk_params->mi_col_edge = mi_col + blk_params->mi_step_w;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   blk_params->mi_row_edge = mi_row + blk_params->mi_step;
   blk_params->mi_col_edge = mi_col + blk_params->mi_step;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   blk_params->width = block_size_wide[bsize];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  blk_params->min_partition_size = x->sb_enc.min_partition_size;
+#else
   blk_params->min_partition_size_1d =
       block_size_wide[x->sb_enc.min_partition_size];
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
   blk_params->split_bsize2 = blk_params->subsize;
+#if !CONFIG_EXT_RECUR_PARTITIONS
   blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   blk_params->bsize = bsize;
 
   // Check if the partition corresponds to edge block.
@@ -2145,17 +2598,42 @@
 
   // Set partition plane context index.
   part_search_state->pl_ctx_idx =
+#if CONFIG_EXT_RECUR_PARTITIONS
+      is_partition_point(bsize)
+#else
       blk_params->bsize_at_least_8x8
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
           ? partition_plane_context(xd, mi_row, mi_col, bsize)
           : 0;
 
   // Partition cost buffer update
   ModeCosts *mode_costs = &x->mode_costs;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int pl = part_search_state->pl_ctx_idx;
+  if (is_square_block(bsize)) {
 #if CONFIG_SDP
+    part_search_state->partition_cost =
+        mode_costs->partition_cost[xd->tree_type == CHROMA_PART][pl];
+#else
+    part_search_state->partition_cost = mode_costs->partition_cost[pl];
+#endif  // CONFIG_SDP
+  } else {
+    for (PARTITION_TYPE p = PARTITION_NONE; p < EXT_PARTITION_TYPES; ++p) {
+      PARTITION_TYPE_REC p_rec = get_symbol_from_partition_rec_block(bsize, p);
+
+      if (p_rec != PARTITION_INVALID_REC)
+        part_search_state->partition_cost_table[p] =
+            mode_costs->partition_rec_cost[pl][p_rec];
+      else
+        part_search_state->partition_cost_table[p] = INT_MAX;
+    }
+    part_search_state->partition_cost = part_search_state->partition_cost_table;
+  }
+#elif CONFIG_SDP
   part_search_state->partition_cost =
       mode_costs->partition_cost[xd->tree_type == CHROMA_PART]
                                 [part_search_state->pl_ctx_idx];
-#else
+#else  // !CONFIG_EXT_RECUR_PARTITIONS && !CONFIG_SDP
   part_search_state->partition_cost =
       mode_costs->partition_cost[part_search_state->pl_ctx_idx];
 #endif
@@ -2185,55 +2663,144 @@
 
   // Initialize partition search flags to defaults.
   part_search_state->terminate_partition_search = 0;
+
+  av1_zero(part_search_state->prune_rect_part);
+
 #if CONFIG_SDP
+#if !CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->do_square_split =
       blk_params->bsize_at_least_8x8 &&
       (xd->tree_type != CHROMA_PART || bsize > BLOCK_8X8);
   part_search_state->do_rectangular_split =
       cpi->oxcf.part_cfg.enable_rect_partitions &&
       (xd->tree_type != CHROMA_PART || bsize > BLOCK_8X8);
+#else
+  part_search_state->do_rectangular_split =
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      (xd->tree_type != CHROMA_PART || is_bsize_gt(bsize, BLOCK_8X8));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
-  av1_zero(part_search_state->prune_rect_part);
+  const BLOCK_SIZE horz_subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+  const BLOCK_SIZE vert_subsize = get_partition_subsize(bsize, PARTITION_VERT);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // TODO(chiyotsai,yuec@google.com): Fix the rect_allowed condition when both
+  // SDP and ERP are on.
+  const int is_horz_size_valid =
+      is_partition_valid(bsize, PARTITION_HORZ) &&
+      IMPLIES(xd->tree_type == SHARED_PART,
+              check_is_chroma_size_valid(PARTITION_HORZ, bsize, mi_row, mi_col,
+                                         part_search_state->ss_x,
+                                         part_search_state->ss_y, pc_tree));
+
+  const int is_vert_size_valid =
+      is_partition_valid(bsize, PARTITION_VERT) &&
+      IMPLIES(xd->tree_type == SHARED_PART,
+              check_is_chroma_size_valid(PARTITION_VERT, bsize, mi_row, mi_col,
+                                         part_search_state->ss_x,
+                                         part_search_state->ss_y, pc_tree));
+#else
+  const int is_horz_size_valid =
+      horz_subsize != BLOCK_INVALID &&
+      get_plane_block_size(horz_subsize, part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  const int is_vert_size_valid =
+      vert_subsize != BLOCK_INVALID &&
+      get_plane_block_size(vert_subsize, part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#if !CONFIG_EXT_RECUR_PARTITIONS
+  const bool no_sub_16_chroma_part =
+      xd->tree_type != CHROMA_PART ||
+      (block_size_wide[bsize] > 8 && block_size_high[bsize] > 8);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Initialize allowed partition types for the partition block.
+  part_search_state->is_block_splittable = is_partition_point(bsize);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  part_search_state->partition_none_allowed =
+      (xd->tree_type == CHROMA_PART && bsize == BLOCK_8X8) ||
+      (blk_params->has_rows && blk_params->has_cols &&
+       is_bsize_geq(blk_params->bsize, blk_params->min_partition_size));
+#else
   part_search_state->partition_none_allowed =
       blk_params->has_rows && blk_params->has_cols;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->partition_rect_allowed[HORZ] =
-      blk_params->has_cols && blk_params->bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
-      (xd->tree_type != CHROMA_PART || bsize > BLOCK_8X8) &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
-                           part_search_state->ss_x,
-                           part_search_state->ss_y) != BLOCK_INVALID;
-  part_search_state->partition_rect_allowed[VERT] =
-      blk_params->has_rows && blk_params->bsize_at_least_8x8 &&
-      (xd->tree_type != CHROMA_PART || bsize > BLOCK_8X8) &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
-                           part_search_state->ss_x,
-                           part_search_state->ss_y) != BLOCK_INVALID;
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params->has_cols || !blk_params->has_rows) &&
 #else
+      blk_params->has_cols &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#if !CONFIG_EXT_RECUR_PARTITIONS
+      blk_params->bsize_at_least_8x8 && no_sub_16_chroma_part &&
+#endif  //  !CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      is_bsize_geq(horz_subsize, blk_params->min_partition_size) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      is_horz_size_valid;
+  part_search_state->partition_rect_allowed[VERT] =
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params->has_rows || !blk_params->has_cols) &&
+#else
+      blk_params->has_rows &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#if !CONFIG_EXT_RECUR_PARTITIONS
+      blk_params->bsize_at_least_8x8 && no_sub_16_chroma_part &&
+#endif  //  !CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      is_bsize_geq(vert_subsize, blk_params->min_partition_size) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      is_vert_size_valid;
+#else  // !CONFIG_SDP
+#if !CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->do_rectangular_split =
       cpi->oxcf.part_cfg.enable_rect_partitions;
-  av1_zero(part_search_state->prune_rect_part);
 
   // Initialize allowed partition types for the partition block.
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int is_chroma_size_valid_horz = check_is_chroma_size_valid(
+      PARTITION_HORZ, bsize, mi_row, mi_col, part_search_state->ss_x,
+      part_search_state->ss_y, pc_tree);
+
+  int is_chroma_size_valid_vert = check_is_chroma_size_valid(
+      PARTITION_VERT, bsize, mi_row, mi_col, part_search_state->ss_x,
+      part_search_state->ss_y, pc_tree);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+  part_search_state->is_block_splittable = is_partition_point(bsize);
   part_search_state->partition_none_allowed =
       blk_params->has_rows && blk_params->has_cols;
   part_search_state->partition_rect_allowed[HORZ] =
-      blk_params->has_cols && blk_params->bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params->has_cols || !blk_params->has_rows) &&
+      is_partition_valid(bsize, PARTITION_HORZ) && is_chroma_size_valid_horz &&
+      is_bsize_geq(get_partition_subsize(bsize, PARTITION_HORZ),
+                   blk_params->min_partition_size) &&
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+      blk_params->has_cols && is_partition_valid(bsize, PARTITION_HORZ) &&
       get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
                            part_search_state->ss_x,
-                           part_search_state->ss_y) != BLOCK_INVALID;
+                           part_search_state->ss_y) != BLOCK_INVALID &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions;
   part_search_state->partition_rect_allowed[VERT] =
-      blk_params->has_rows && blk_params->bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params->has_rows || !blk_params->has_cols) &&
+      is_partition_valid(bsize, PARTITION_VERT) && is_chroma_size_valid_vert &&
+      is_bsize_geq(get_partition_subsize(bsize, PARTITION_VERT),
+                   blk_params->min_partition_size) &&
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+      blk_params->has_rows && is_partition_valid(bsize, PARTITION_VERT) &&
       get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
                            part_search_state->ss_x,
-                           part_search_state->ss_y) != BLOCK_INVALID;
-#endif
+                           part_search_state->ss_y) != BLOCK_INVALID &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions;
+#endif  // CONFIG_SDP
 
   // Reset the flag indicating whether a partition leading to a rdcost lower
   // than the bound best_rdc has been found.
@@ -2247,8 +2814,29 @@
     PartitionSearchState *part_search_state) {
 #else
     AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
-#endif
+#endif  // CONFIG_SDP
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int has_rows = blk_params.has_rows;
+  const int has_cols = blk_params.has_cols;
+  (void)cm;
+  if (!(has_rows && has_cols)) {
+    if (!has_rows && !has_cols) {
+      // At the bottom right, horz or vert
+      aom_cdf_prob binary_cdf[2] = { 16384, AOM_ICDF(CDF_PROB_TOP) };
+      static const int binary_inv_map[2] = { PARTITION_HORZ, PARTITION_VERT };
+      av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost,
+                               binary_cdf, binary_inv_map);
+    } else {
+      for (int i = 0; i < PARTITION_TYPES; ++i)
+        part_search_state->tmp_partition_cost[i] = 0;
+    }
+    part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+  }
+#if CONFIG_SDP
+  (void)xd;
+#endif  // CONFIG_SDP
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
 #if CONFIG_SDP
   const int plane = xd->tree_type == CHROMA_PART;
@@ -2281,36 +2869,108 @@
   }
   // Override the partition cost buffer.
   part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 // Reset the partition search state flags when
 // must_find_valid_partition is equal to 1.
 static AOM_INLINE void reset_part_limitations(
-    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+    AV1_COMP *const cpi,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    TREE_TYPE tree_type,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_EXT_RECUR_PARTITIONS
+    PC_TREE *pc_tree,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    PartitionSearchState *part_search_state) {
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
-  const int is_rect_part_allowed =
-      blk_params.bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
-      (blk_params.width > blk_params.min_partition_size_1d);
+#if !CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->do_square_split =
       blk_params.bsize_at_least_8x8 &&
       (blk_params.width > blk_params.min_partition_size_1d);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  part_search_state->partition_none_allowed =
+      (tree_type == CHROMA_PART && blk_params.bsize == BLOCK_8X8) ||
+      (blk_params.has_rows && blk_params.has_cols &&
+       is_bsize_geq(blk_params.bsize, blk_params.min_partition_size));
+#else
   part_search_state->partition_none_allowed =
       blk_params.has_rows && blk_params.has_cols &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      is_bsize_geq(blk_params.bsize, blk_params.min_partition_size);
+#else
       (blk_params.width >= blk_params.min_partition_size_1d);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+
+  // Initialize allowed partition types for the partition block.
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE horz_subsize =
+      get_partition_subsize(blk_params.bsize, PARTITION_HORZ);
+  const BLOCK_SIZE vert_subsize =
+      get_partition_subsize(blk_params.bsize, PARTITION_VERT);
+  const int is_chroma_size_valid_horz = check_is_chroma_size_valid(
+      PARTITION_HORZ, blk_params.bsize, blk_params.mi_row, blk_params.mi_col,
+      part_search_state->ss_x, part_search_state->ss_y, pc_tree);
+  const int is_chroma_size_valid_vert = check_is_chroma_size_valid(
+      PARTITION_VERT, blk_params.bsize, blk_params.mi_row, blk_params.mi_col,
+      part_search_state->ss_x, part_search_state->ss_y, pc_tree);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   part_search_state->partition_rect_allowed[HORZ] =
-      blk_params.has_cols && is_rect_part_allowed &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params.has_cols || !blk_params.has_rows) &&
+      is_partition_valid(blk_params.bsize, PARTITION_HORZ) &&
+      is_chroma_size_valid_horz &&
+      is_bsize_geq(horz_subsize, blk_params.min_partition_size) &&
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+      blk_params.has_cols &&
+      is_partition_valid(blk_params.bsize, PARTITION_HORZ) &&
       get_plane_block_size(
           get_partition_subsize(blk_params.bsize, PARTITION_HORZ),
-          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID &&
+      (blk_params.width > blk_params.min_partition_size_1d) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions;
   part_search_state->partition_rect_allowed[VERT] =
-      blk_params.has_rows && is_rect_part_allowed &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      (blk_params.has_rows || !blk_params.has_cols) &&
+      is_partition_valid(blk_params.bsize, PARTITION_VERT) &&
+      is_chroma_size_valid_vert &&
+      is_bsize_geq(vert_subsize, blk_params.min_partition_size) &&
+#else   // CONFIG_EXT_RECUR_PARTITIONS
+      blk_params.has_rows &&
+      is_partition_valid(blk_params.bsize, PARTITION_VERT) &&
       get_plane_block_size(
           get_partition_subsize(blk_params.bsize, PARTITION_VERT),
-          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID &&
+      (blk_params.width > blk_params.min_partition_size_1d) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+      cpi->oxcf.part_cfg.enable_rect_partitions;
   part_search_state->terminate_partition_search = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (!is_square_block(blk_params.bsize)) {
+    if (!part_search_state->partition_rect_allowed[HORZ] &&
+        !part_search_state->partition_rect_allowed[VERT] &&
+        !part_search_state->partition_none_allowed) {
+      if (block_size_wide[blk_params.bsize] >
+          block_size_high[blk_params.bsize]) {
+        if (is_bsize_geq(vert_subsize, blk_params.min_partition_size)) {
+          part_search_state->partition_rect_allowed[VERT] = 1;
+        }
+      } else {
+        if (is_bsize_geq(horz_subsize, blk_params.min_partition_size)) {
+          part_search_state->partition_rect_allowed[HORZ] = 1;
+        }
+      }
+    }
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
+static const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+                                                         PARTITION_VERT };
+#if !CONFIG_EXT_RECUR_PARTITIONS
 // Rectangular partitions evaluation at sub-block level.
 static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
                                    MACROBLOCK *x,
@@ -2343,9 +3003,86 @@
   part_search_state->rect_part_rd[rect_part][idx] =
       part_search_state->this_rdc.rdcost;
 }
+#else
+static void rd_pick_rect_partition(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    RECT_PART_TYPE rect_type,
+    const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2],
+    BLOCK_SIZE bsize, const int is_not_edge_block[NUM_RECT_PARTS],
+    SB_MULTI_PASS_MODE multi_pass_mode
+#if CONFIG_SDP
+    ,
+    const PARTITION_TREE *ptree_luma, const PARTITION_TREE *template_tree
+#endif  // CONFIG_SDP
+) {
+  const PARTITION_TYPE partition_type = rect_partition_type[rect_type];
+  RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+
+  sum_rdc->rate = part_search_state->partition_cost[partition_type];
+  sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+
+  RD_STATS this_rdc;
+  RD_STATS best_remain_rdcost;
+  PC_TREE **sub_tree =
+      (rect_type == HORZ) ? pc_tree->horizontal : pc_tree->vertical;
+#if CONFIG_SDP
+  const int track_ptree_luma =
+      ptree_luma && ptree_luma->partition == partition_type;
+#endif  // CONFIG_SDP
+  av1_rd_stats_subtraction(x->rdmult, best_rdc, sum_rdc, &best_remain_rdcost);
+  bool partition_found = av1_rd_pick_partition(
+      cpi, td, tile_data, tp, mi_pos_rect[rect_type][0][0],
+      mi_pos_rect[rect_type][0][1], bsize, &this_rdc, best_remain_rdcost,
+      sub_tree[0],
+#if CONFIG_SDP
+      track_ptree_luma ? ptree_luma->sub_tree[0] : NULL,
+      get_partition_subtree_const(template_tree, 0),
+#endif  // CONFIG_SDP
+      NULL, NULL, multi_pass_mode, NULL);
+  av1_rd_cost_update(x->rdmult, &this_rdc);
+  if (!partition_found) {
+    av1_invalid_rd_stats(sum_rdc);
+    return;
+  } else {
+    sum_rdc->rate += this_rdc.rate;
+    sum_rdc->dist += this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, sum_rdc);
+  }
+  part_search_state->rect_part_rd[rect_type][0] = this_rdc.rdcost;
+
+  if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[rect_type]) {
+    av1_rd_stats_subtraction(x->rdmult, best_rdc, sum_rdc, &best_remain_rdcost);
+    partition_found = av1_rd_pick_partition(
+        cpi, td, tile_data, tp, mi_pos_rect[rect_type][1][0],
+        mi_pos_rect[rect_type][1][1], bsize, &this_rdc, best_remain_rdcost,
+        sub_tree[1],
+#if CONFIG_SDP
+        track_ptree_luma ? ptree_luma->sub_tree[1] : NULL,
+        get_partition_subtree_const(template_tree, 1),
+#endif  // CONFIG_SDP
+        NULL, NULL, multi_pass_mode, NULL);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+    part_search_state->rect_part_rd[rect_type][1] = this_rdc.rdcost;
+
+    if (!partition_found) {
+      av1_invalid_rd_stats(sum_rdc);
+      return;
+    } else {
+      sum_rdc->rate += this_rdc.rate;
+      sum_rdc->dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, sum_rdc);
+    }
+  }
+}
+#endif
 
 typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
 
+#define IS_FORCED_PARTITION_TYPE(cur_partition) \
+  (forced_partition == PARTITION_INVALID || forced_partition == (cur_partition))
+
 // Checks if HORZ / VERT partition search is allowed.
 static AOM_INLINE int is_rect_part_allowed(
     const AV1_COMP *cpi, PartitionSearchState *part_search_state,
@@ -2355,23 +3092,75 @@
       (!part_search_state->terminate_partition_search &&
        part_search_state->partition_rect_allowed[rect_part] &&
        !part_search_state->prune_rect_part[rect_part] &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+       is_partition_valid(blk_params.bsize, rect_partition_type[rect_part]) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
        (part_search_state->do_rectangular_split ||
         active_edge[rect_part](cpi, mi_pos, blk_params.mi_step)));
   return is_part_allowed;
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+static INLINE int is_bsize_pruning_cand(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_INVALID) {
+    return 0;
+  }
+
+  const int avg_bsize = (block_size_wide[bsize] + block_size_high[bsize]) / 2;
+  return avg_bsize <= 32;
+}
+
+static AOM_INLINE PARTITION_TYPE get_forced_partition_type(
+    const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row, int mi_col,
+    BLOCK_SIZE bsize
+#if CONFIG_SDP
+    ,
+    const PARTITION_TREE *template_tree
+#endif  // CONFIG_SDP
+) {
+#if CONFIG_SDP
+  if (template_tree) {
+    return template_tree->partition;
+  }
+#endif  // CONFIG_SDP
+
+  if (should_reuse_mode(x, REUSE_PARTITION_MODE_FLAG)) {
+    return av1_get_prev_partition(x, mi_row, mi_col, bsize,
+                                  cm->seq_params.sb_size);
+  }
+  return PARTITION_INVALID;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 // Rectangular partition types search function.
 static void rectangular_partition_search(
     AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
     TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
     RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
     PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+#if CONFIG_EXT_RECUR_PARTITIONS
+    SB_MULTI_PASS_MODE multi_pass_mode,
+#if CONFIG_SDP
+    const PARTITION_TREE *ptree_luma, const PARTITION_TREE *template_tree,
+#endif  // CONFIG_SDP
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     RD_RECT_PART_WIN_INFO *rect_part_win_info) {
   const AV1_COMMON *const cm = &cpi->common;
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+#if CONFIG_SDP
+  const PARTITION_TYPE forced_partition =
+      get_forced_partition_type(cm, x, blk_params.mi_row, blk_params.mi_col,
+                                blk_params.bsize, template_tree);
+#else
+  const PARTITION_TYPE forced_partition = get_forced_partition_type(
+      cm, x, blk_params.mi_row, blk_params.mi_col, blk_params.bsize);
+#endif  // CONFIG_SDP
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   RD_STATS *sum_rdc = &part_search_state->sum_rdc;
-  const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
-                                                    PARTITION_VERT };
 
   // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row postion of
   //                                           HORZ and VERT partition types.
@@ -2392,12 +3181,14 @@
   // Indicates edge blocks for HORZ and VERT partition types.
   const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
                                                   blk_params.has_cols };
+#if !CONFIG_EXT_RECUR_PARTITIONS
 
   // Initialize pc tree context for HORZ and VERT partition types.
   PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
     { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
     { &pc_tree->vertical[0], &pc_tree->vertical[1] }
   };
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Loop over rectangular partition types.
   for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
@@ -2410,16 +3201,67 @@
       continue;
 
     // Sub-partition idx.
-    int sub_part_idx = 0;
-    PARTITION_TYPE partition_type = rect_partition_type[i];
+    const PARTITION_TYPE partition_type = rect_partition_type[i];
     blk_params.subsize =
         get_partition_subsize(blk_params.bsize, partition_type);
+    const int part_hv_rate = part_search_state->partition_cost[partition_type];
+    if (part_hv_rate == INT_MAX ||
+        RDCOST(x->rdmult, part_hv_rate, 0) >= best_rdc->rdcost) {
+      continue;
+    }
+#if !CONFIG_EXT_RECUR_PARTITIONS
     assert(blk_params.subsize <= BLOCK_LARGEST);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     av1_init_rd_stats(sum_rdc);
+#if CONFIG_EXT_RECUR_PARTITIONS
+    if (!IS_FORCED_PARTITION_TYPE(partition_type)) {
+      continue;
+    }
+
+    if (cpi->sf.part_sf.enable_fast_erp && !frame_is_intra_only(cm) &&
+        !x->must_find_valid_partition &&
+        is_bsize_pruning_cand(blk_params.bsize)) {
+      if (av1_prune_part_hv_with_sms(cpi, tile_data, x, part_search_state,
+                                     best_rdc, &blk_params, i, part_hv_rate)) {
+        continue;
+      }
+    }
+
+    PC_TREE **sub_tree = (i == HORZ) ? pc_tree->horizontal : pc_tree->vertical;
+
+    const int num_planes = av1_num_planes(cm);
+    for (int idx = 0; idx < SUB_PARTITIONS_RECT; idx++) {
+      if (sub_tree[idx]) {
+        av1_free_pc_tree_recursive(sub_tree[idx], num_planes, 0, 0);
+        sub_tree[idx] = NULL;
+      }
+    }
+    sub_tree[0] = av1_alloc_pc_tree_node(
+        mi_pos_rect[i][0][0], mi_pos_rect[i][0][1], blk_params.subsize, pc_tree,
+        partition_type, 0, 0, ss_x, ss_y);
+    sub_tree[1] = av1_alloc_pc_tree_node(
+        mi_pos_rect[i][1][0], mi_pos_rect[i][1][1], blk_params.subsize, pc_tree,
+        partition_type, 1, 1, ss_x, ss_y);
+
+#if CONFIG_SDP
+    rd_pick_rect_partition(cpi, td, tile_data, tp, x, pc_tree,
+                           part_search_state, best_rdc, i, mi_pos_rect,
+                           blk_params.subsize, is_not_edge_block,
+                           multi_pass_mode, ptree_luma, template_tree);
+#else
+    rd_pick_rect_partition(
+        cpi, td, tile_data, tp, x, pc_tree, part_search_state, best_rdc, i,
+        mi_pos_rect, blk_params.subsize, is_not_edge_block, multi_pass_mode);
+#endif  // CONFIG_SDP
+#else
+    int sub_part_idx = 0;
     for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+      assert(cur_ctx[i][j] != NULL);
       if (cur_ctx[i][j][0] == NULL) {
-        cur_ctx[i][j][0] =
-            av1_alloc_pmc(cm, blk_params.subsize, &td->shared_coeff_buf);
+        cur_ctx[i][j][0] = av1_alloc_pmc(
+            cm, mi_pos_rect[i][j][0], mi_pos_rect[i][j][1], blk_params.subsize,
+            pc_tree, partition_type, j, part_search_state->ss_x,
+            part_search_state->ss_y, &td->shared_coeff_buf);
       }
     }
     sum_rdc->rate = part_search_state->partition_cost[partition_type];
@@ -2462,6 +3304,7 @@
           best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
           mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
     }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
       aom_usec_timer_mark(&partition_timer);
@@ -2483,11 +3326,12 @@
       if (rect_part_win_info != NULL)
         rect_part_win_info->rect_part_win[i] = false;
     }
-    av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+    av1_restore_context(cm, x, x_ctx, blk_params.mi_row, blk_params.mi_col,
                         blk_params.bsize, av1_num_planes(cm));
   }
 }
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
 // AB partition type evaluation.
 static void rd_pick_ab_part(
     AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
@@ -2530,7 +3374,7 @@
     partition_timer_on = 0;
   }
 #endif
-  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+  av1_restore_context(cm, x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
 }
 
 // Check if AB partitions search is allowed.
@@ -2647,9 +3491,13 @@
 
     blk_params.subsize = get_partition_subsize(bsize, part_type);
     for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+      assert(cur_part_ctxs[ab_part_type] != NULL);
       // Set AB partition context.
-      cur_part_ctxs[ab_part_type][i] =
-          av1_alloc_pmc(cm, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+          cm, ab_mi_pos[ab_part_type][i][0], ab_mi_pos[ab_part_type][i][1],
+          ab_subsize[ab_part_type][i], pc_tree, part_type, i,
+          part_search_state->ss_x, part_search_state->ss_y,
+          &td->shared_coeff_buf);
       // Set mode as not ready.
       cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
     }
@@ -2691,7 +3539,7 @@
     MACROBLOCK *x, const AV1_COMMON *const cm, ThreadData *td,
     PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
     PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
-    BLOCK_SIZE bsize) {
+    BLOCK_SIZE bsize, int mi_pos[SUB_PARTITIONS_PART4][2], PC_TREE *pc_tree) {
   // Initialize sum_rdc RD cost structure.
   av1_init_rd_stats(&part_search_state->sum_rdc);
   const int subsize = get_partition_subsize(bsize, partition_type);
@@ -2699,8 +3547,12 @@
       part_search_state->partition_cost[partition_type];
   part_search_state->sum_rdc.rdcost =
       RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
-  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i)
-    cur_part_ctx[i] = av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    cur_part_ctx[i] =
+        av1_alloc_pmc(cm, mi_pos[i][0], mi_pos[i][1], subsize, pc_tree,
+                      partition_type, i, part_search_state->ss_x,
+                      part_search_state->ss_y, &td->shared_coeff_buf);
+  }
 }
 
 // Partition search of HORZ4 / VERT4 partition types.
@@ -2719,11 +3571,11 @@
   int mi_pos[SUB_PARTITIONS_PART4][2];
 
   blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
-  // Set partition context and RD cost.
-  set_4_part_ctx_and_rdcost(x, cm, td, cur_part_ctx, part_search_state,
-                            partition_type, blk_params.bsize);
   // Set mi positions for sub-block sizes.
   set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+  // Set partition context and RD cost.
+  set_4_part_ctx_and_rdcost(x, cm, td, cur_part_ctx, part_search_state,
+                            partition_type, blk_params.bsize, mi_pos, pc_tree);
 #if CONFIG_COLLECT_PARTITION_STATS
   if (best_rdc.rdcost - part_search_state->sum_rdc.rdcost >= 0) {
     partition_attempts[partition_type] += 1;
@@ -2761,7 +3613,7 @@
     partition_timer_on = 0;
   }
 #endif
-  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+  av1_restore_context(cm, x, x_ctx, blk_params.mi_row, blk_params.mi_col,
                       blk_params.bsize, av1_num_planes(cm));
 }
 
@@ -2822,7 +3674,8 @@
                              part_search_state->ss_x,
                              part_search_state->ss_y) != BLOCK_INVALID;
   }
-  // Pruning: pruning out 4-way partitions based on the current best partition.
+  // Pruning: pruning out 4-way partitions based on the current best
+  // partition.
   if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
     part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
                                     pc_tree->partitioning == PARTITION_HORZ_A ||
@@ -2848,18 +3701,34 @@
         pb_source_variance, mi_row, mi_col);
   }
 
-  // Pruning: pruning out 4-way partitions based on the number of horz/vert wins
-  // in the current block and sub-blocks in PARTITION_SPLIT.
+  // Pruning: pruning out 4-way partitions based on the number of horz/vert
+  // wins in the current block and sub-blocks in PARTITION_SPLIT.
   prune_4_partition_using_split_info(cpi, x, part_search_state,
                                      part4_search_allowed);
 }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
 // Set PARTITION_NONE allowed flag.
 static AOM_INLINE void set_part_none_allowed_flag(
-    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+    const AV1_COMP *const cpi,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    TREE_TYPE tree_type,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    PartitionSearchState *part_search_state) {
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  if (tree_type == CHROMA_PART && blk_params.bsize == BLOCK_8X8) {
+    part_search_state->partition_none_allowed = 1;
+    return;
+  }
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (is_bsize_geq(blk_params.min_partition_size, blk_params.bsize) &&
+      blk_params.has_rows && blk_params.has_cols)
+#else
   if ((blk_params.width <= blk_params.min_partition_size_1d) &&
       blk_params.has_rows && blk_params.has_cols)
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     part_search_state->partition_none_allowed = 1;
   assert(part_search_state->terminate_partition_search == 0);
 
@@ -2887,11 +3756,14 @@
   RD_STATS partition_rdcost;
   // Set PARTITION_NONE context.
   if (pc_tree->none == NULL)
-    pc_tree->none = av1_alloc_pmc(cm, blk_params.bsize, &td->shared_coeff_buf);
+    pc_tree->none = av1_alloc_pmc(
+        cm, blk_params.mi_row, blk_params.mi_col, blk_params.bsize, pc_tree,
+        PARTITION_NONE, 0, part_search_state->ss_x, part_search_state->ss_y,
+        &td->shared_coeff_buf);
 
   // Set PARTITION_NONE type cost.
   if (part_search_state->partition_none_allowed) {
-    if (blk_params.bsize_at_least_8x8) {
+    if (part_search_state->is_block_splittable) {
       *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX
                      ? part_search_state->partition_cost[PARTITION_NONE]
                      : 0;
@@ -2916,22 +3788,37 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+#if !CONFIG_EXT_RECUR_PARTITIONS
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   RD_STATS *this_rdc = &part_search_state->this_rdc;
   const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  (void)sms_tree;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   if (!frame_is_intra_only(cm) &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+      part_search_state->do_rectangular_split &&
+#else
       (part_search_state->do_square_split ||
        part_search_state->do_rectangular_split) &&
+#endif
       !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
     const int use_ml_based_breakout =
         bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+#if CONFIG_EXT_RECUR_PARTITIONS
+        is_square_block(bsize) &&
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
         bsize > BLOCK_4X4 && xd->bd == 8;
     if (use_ml_based_breakout) {
       if (av1_ml_predict_breakout(cpi, bsize, x, this_rdc,
                                   *pb_source_variance)) {
+#if !CONFIG_EXT_RECUR_PARTITIONS
         part_search_state->do_square_split = 0;
+#endif
         part_search_state->do_rectangular_split = 0;
       }
     }
@@ -2951,11 +3838,14 @@
     // disable the early termination at that speed.
     if (best_rdc->dist < dist_breakout_thr &&
         best_rdc->rate < rate_breakout_thr) {
+#if !CONFIG_EXT_RECUR_PARTITIONS
       part_search_state->do_square_split = 0;
+#endif
       part_search_state->do_rectangular_split = 0;
     }
   }
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
   // Early termination: using simple_motion_search features and the
   // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a
   // decision on early terminating at PARTITION_NONE.
@@ -2971,8 +3861,10 @@
         cpi, x, sms_tree, blk_params.mi_row, blk_params.mi_col, bsize, this_rdc,
         &part_search_state->terminate_partition_search);
   }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 }
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
 // Decide early termination and rectangular partition pruning
 // based on PARTITION_NONE and PARTITION_SPLIT costs.
 static void prune_partitions_after_split(
@@ -2986,6 +3878,13 @@
   const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  (void)sms_tree;
+  (void)part_none_rd;
+  (void)part_split_rd;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+#if !CONFIG_EXT_RECUR_PARTITIONS
   // Early termination: using the rd costs of PARTITION_NONE and subblocks
   // from PARTITION_SPLIT to determine an early breakout.
   if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
@@ -2999,6 +3898,7 @@
         part_search_state->split_rd, mi_row, mi_col,
         &part_search_state->terminate_partition_search);
   }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
   // to prune out rectangular partitions in some directions.
@@ -3010,13 +3910,14 @@
         part_search_state->prune_rect_part[VERT]) &&
       !part_search_state->terminate_partition_search) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
-                         bsize);
+                         NULL);
     av1_ml_prune_rect_partition(
         cpi, x, bsize, best_rdc->rdcost, part_search_state->none_rd,
         part_search_state->split_rd, &part_search_state->prune_rect_part[HORZ],
         &part_search_state->prune_rect_part[VERT]);
   }
 }
+#endif
 
 // PARTITION_NONE search.
 static void none_partition_search(
@@ -3033,8 +3934,15 @@
   const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  (void)part_none_rd;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   // Set PARTITION_NONE allowed flag.
-  set_part_none_allowed_flag(cpi, part_search_state);
+  set_part_none_allowed_flag(cpi,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                             x->e_mbd.tree_type,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                             part_search_state);
   if (!part_search_state->partition_none_allowed) return;
 
   int pt_cost = 0;
@@ -3052,9 +3960,21 @@
     partition_timer_on = 1;
   }
 #endif
+#if CONFIG_EXT_RECUR_PARTITIONS
+  SimpleMotionData *sms_data = av1_get_sms_data_entry(
+      x->sms_bufs, mi_row, mi_col, bsize, cm->seq_params.sb_size);
+  av1_set_best_mode_cache(x, sms_data->mode_cache);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   // PARTITION_NONE evaluation and cost update.
   pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
                 bsize, pc_tree->none, best_remain_rdcost);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  x->inter_mode_cache = NULL;
+  if (this_rdc->rate != INT_MAX) {
+    av1_add_mode_search_context_to_cache(sms_data, pc_tree->none);
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   av1_rd_cost_update(x->rdmult, this_rdc);
 
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -3078,17 +3998,23 @@
     }
 
     // Calculate the total cost and update the best partition.
-    if (blk_params.bsize_at_least_8x8) {
+    if (part_search_state->is_block_splittable) {
       this_rdc->rate += pt_cost;
       this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
     }
+#if !CONFIG_EXT_RECUR_PARTITIONS
     *part_none_rd = this_rdc->rdcost;
+#endif
     if (this_rdc->rdcost < best_rdc->rdcost) {
       *best_rdc = *this_rdc;
       part_search_state->found_best_partition = true;
+#if !CONFIG_EXT_RECUR_PARTITIONS
       if (blk_params.bsize_at_least_8x8) {
         pc_tree->partitioning = PARTITION_NONE;
       }
+#else
+      pc_tree->partitioning = PARTITION_NONE;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
       // Disable split and rectangular partition search
       // based on PARTITION_NONE cost.
@@ -3097,9 +4023,10 @@
                                   pb_source_variance);
     }
   }
-  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+  av1_restore_context(cm, x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
 }
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
 // PARTITION_SPLIT search.
 static void split_partition_search(
     AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
@@ -3122,12 +4049,6 @@
       !part_search_state->do_square_split)
     return;
 
-  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
-    if (pc_tree->split[i] == NULL)
-      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
-    pc_tree->split[i]->index = i;
-  }
-
   // Initialization of this partition RD stats.
   av1_init_rd_stats(&sum_rdc);
   sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
@@ -3151,7 +4072,11 @@
         mi_col + x_idx >= mi_params->mi_cols)
       continue;
 
-    pc_tree->split[idx]->index = idx;
+    if (pc_tree->split[idx] == NULL) {
+      pc_tree->split[idx] = av1_alloc_pc_tree_node(
+          mi_row + y_idx, mi_col + x_idx, subsize, pc_tree, PARTITION_SPLIT,
+          idx, idx == 3, part_search_state->ss_x, part_search_state->ss_y);
+    }
     int64_t *p_split_rd = &part_search_state->split_rd[idx];
     RD_STATS best_remain_rdcost;
     av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
@@ -3224,9 +4149,453 @@
           !(partition_none_valid && partition_none_better);
     }
   }
-  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+  av1_restore_context(cm, x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+/*!\cond */
+typedef struct {
+  SIMPLE_MOTION_DATA_TREE *sms_tree;  // SMS data forwarded to recursion; may be NULL
+  PC_TREE *pc_tree;  // Partition-tree node for this subblock
+#if CONFIG_SDP
+  const PARTITION_TREE *ptree_luma;  // Matching luma subtree (chroma coding)
+  const PARTITION_TREE *template_tree;  // Partition template subtree to follow
+#endif  // CONFIG_SDP
+  PICK_MODE_CONTEXT *ctx;  // Mode context; only read when !is_splittable
+  int mi_row;  // Subblock top-left row, in MI units
+  int mi_col;  // Subblock top-left column, in MI units
+  BLOCK_SIZE bsize;  // Subblock size
+  PARTITION_TYPE partition;  // Partition type that produced this subblock
+  int is_last_subblock;  // Nonzero for the final subblock of the partition
+  int is_splittable;  // Nonzero: recurse via av1_rd_pick_partition
+} SUBBLOCK_RDO_DATA;
+/*!\endcond */
+
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks)
+static int rd_try_subblock_new(AV1_COMP *const cpi, ThreadData *td,
+                               TileDataEnc *tile_data, TokenExtra **tp,
+                               SUBBLOCK_RDO_DATA *rdo_data,
+                               RD_STATS best_rdcost, RD_STATS *sum_rdc,
+                               SB_MULTI_PASS_MODE multi_pass_mode) {
+  MACROBLOCK *const x = &td->mb;
+  const int orig_mult = x->rdmult;  // Saved; restored on every return path below.
+  const int mi_row = rdo_data->mi_row;
+  const int mi_col = rdo_data->mi_col;
+  const BLOCK_SIZE bsize = rdo_data->bsize;
+
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+  RD_STATS rdcost_remaining;  // RD budget left after earlier subblocks' sum_rdc.
+  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
+  RD_STATS this_rdc;
+
+  if (rdo_data->is_splittable) {  // Recurse: search partitions inside subblock.
+    if (!av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                               &this_rdc, rdcost_remaining, rdo_data->pc_tree,
+#if CONFIG_SDP
+                               rdo_data->ptree_luma, rdo_data->template_tree,
+#endif  // CONFIG_SDP
+                               rdo_data->sms_tree, NULL, multi_pass_mode,
+                               NULL)) {
+      av1_invalid_rd_stats(sum_rdc);
+      return 0;
+    }
+  } else {  // Leaf subblock: pick prediction modes directly, via the SMS cache.
+    const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+    SimpleMotionData *sms_data =
+        av1_get_sms_data_entry(x->sms_bufs, mi_row, mi_col, bsize, sb_size);
+    av1_set_best_mode_cache(x, sms_data->mode_cache);
+
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+                  rdo_data->partition, bsize, rdo_data->ctx, rdcost_remaining);
+
+    x->inter_mode_cache = NULL;  // Cache is only valid for the call above.
+    if (this_rdc.rate != INT_MAX) {
+      av1_add_mode_search_context_to_cache(sms_data, rdo_data->ctx);
+    }
+  }
+
+  if (this_rdc.rate == INT_MAX) {  // Search failed: poison the running total.
+    sum_rdc->rdcost = INT64_MAX;
+  } else {
+    sum_rdc->rate += this_rdc.rate;
+    sum_rdc->dist += this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, sum_rdc);
+  }
+
+  if (sum_rdc->rdcost >= best_rdcost.rdcost) {  // Already worse than best: stop.
+    x->rdmult = orig_mult;
+    return 0;
+  }
+
+  if (!rdo_data->is_last_subblock && !rdo_data->is_splittable) {  // Dry-run encode so later subblocks see this context.
+    av1_update_state(cpi, td, rdo_data->ctx, mi_row, mi_col, bsize, 1);
+    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, bsize, NULL);
+  }
+
+  x->rdmult = orig_mult;
+  return 1;
 }
 
+static INLINE void search_partition_horz_3(
+    PartitionSearchState *search_state, AV1_COMP *const cpi, ThreadData *td,
+    TileDataEnc *tile_data, TokenExtra **tp, RD_STATS *best_rdc,
+    PC_TREE *pc_tree,
+#if CONFIG_SDP
+    const PARTITION_TREE *ptree_luma, const PARTITION_TREE *template_tree,
+#endif  // CONFIG_SDP
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, SB_MULTI_PASS_MODE multi_pass_mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const int num_planes = av1_num_planes(cm);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+  const PartitionBlkParams *blk_params = &search_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  const BLOCK_SIZE sml_subsize = get_partition_subsize(bsize, PARTITION_HORZ_3);  // 1/4-height rows
+  const BLOCK_SIZE big_subsize = get_partition_subsize(bsize, PARTITION_HORZ);  // 1/2-height row
+
+  if (search_state->terminate_partition_search || !blk_params->has_rows ||
+      !is_partition_valid(bsize, PARTITION_HORZ_3) ||
+      !(search_state->do_rectangular_split ||
+        av1_active_h_edge(cpi, mi_row, blk_params->mi_step_h))) {
+    return;
+  }
+
+  const int part_h3_rate = search_state->partition_cost[PARTITION_HORZ_3];
+  if (part_h3_rate == INT_MAX ||
+      RDCOST(x->rdmult, part_h3_rate, 0) >= best_rdc->rdcost) {
+    return;
+  }
+  RD_STATS sum_rdc;
+  av1_init_rd_stats(&sum_rdc);
+  const int quarter_step = mi_size_high[bsize] / 4;
+
+  sum_rdc.rate = search_state->partition_cost[PARTITION_HORZ_3];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+  const int step_multipliers[3] = { 0, 1, 2 };
+  const BLOCK_SIZE subblock_sizes[3] = { sml_subsize, big_subsize,
+                                         sml_subsize };
+
+  for (int idx = 0; idx < 3; idx++) {  // Drop stale nodes from a previous pass.
+    if (pc_tree->horizontal3[idx]) {
+      av1_free_pc_tree_recursive(pc_tree->horizontal3[idx], num_planes, 0, 0);
+      pc_tree->horizontal3[idx] = NULL;
+    }
+  }
+  pc_tree->horizontal3[0] =
+      av1_alloc_pc_tree_node(mi_row, mi_col, subblock_sizes[0], pc_tree,
+                             PARTITION_HORZ_3, 0, 0, ss_x, ss_y);
+  pc_tree->horizontal3[1] =
+      av1_alloc_pc_tree_node(mi_row + quarter_step, mi_col, subblock_sizes[1],
+                             pc_tree, PARTITION_HORZ_3, 1, 0, ss_x, ss_y);
+  pc_tree->horizontal3[2] = av1_alloc_pc_tree_node(
+      mi_row + quarter_step * 3, mi_col, subblock_sizes[2], pc_tree,
+      PARTITION_HORZ_3, 2, 1, ss_x, ss_y);
+
+  if (cpi->sf.part_sf.enable_fast_erp && !frame_is_intra_only(cm) &&
+      !x->must_find_valid_partition && is_bsize_pruning_cand(bsize)) {
+    const SimpleMotionData *up = av1_get_sms_data(
+        cpi, &tile_data->tile_info, x, mi_row, mi_col, subblock_sizes[0]);
+    const SimpleMotionData *middle =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row + quarter_step,
+                         mi_col, subblock_sizes[1]);
+    const SimpleMotionData *down =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x,
+                         mi_row + 3 * quarter_step, mi_col, subblock_sizes[2]);
+
+    SMSPartitionStats part_data;
+    part_data.sms_data[0] = up;
+    part_data.sms_data[1] = middle;
+    part_data.sms_data[2] = down;
+    part_data.num_sub_parts = 3;
+    part_data.part_rate = part_h3_rate;
+
+    if (best_rdc->rdcost < INT64_MAX &&
+        (blk_params->mi_row + 2 * (blk_params->mi_step_h) <=
+         cm->mi_params.mi_rows) &&
+        (blk_params->mi_col + 2 * (blk_params->mi_step_w) <=
+         cm->mi_params.mi_cols) &&
+        av1_prune_new_part(&search_state->none_data, &part_data, x->rdmult,
+                           blk_params->bsize, &cpi->sf)) {
+      const BLOCK_SIZE midsize = subblock_sizes[1];
+      const BLOCK_SIZE subsubsize =
+          get_partition_subsize(midsize, PARTITION_VERT);  // halves of middle
+      if (subsubsize == BLOCK_INVALID) {
+        return;
+      }
+
+      // Do one more check to deal with recursion
+      SMSPartitionStats subpart_data;
+      const SimpleMotionData *midleft =  // left half of middle row (at mi_col)
+          av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row + quarter_step,
+                           mi_col, subsubsize);
+      const SimpleMotionData *midright =  // 2*q == w/2 assumes square parent — TODO confirm
+          av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row + quarter_step,
+                           mi_col + 2 * quarter_step, subsubsize);
+      subpart_data.sms_data[0] = up;
+      subpart_data.sms_data[1] = midleft;
+      subpart_data.sms_data[2] = midright;
+      subpart_data.sms_data[3] = down;
+      subpart_data.num_sub_parts = 4;
+      subpart_data.part_rate = 0;
+      if (av1_prune_new_part(&search_state->none_data, &subpart_data, x->rdmult,
+                             bsize, &cpi->sf)) {
+        return;
+      }
+    }
+  }
+
+  int this_mi_row = mi_row;
+  for (int i = 0; i < 3; ++i) {
+    this_mi_row += quarter_step * step_multipliers[i];
+
+    if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;
+
+    SUBBLOCK_RDO_DATA rdo_data = {
+      NULL,
+      pc_tree->horizontal3[i],
+#if CONFIG_SDP
+      get_partition_subtree_const(ptree_luma, i),
+      get_partition_subtree_const(template_tree, i),
+#endif  // CONFIG_SDP
+      NULL,
+      this_mi_row,
+      mi_col,
+      subblock_sizes[i],
+      PARTITION_HORZ_3,
+      i == 2,
+      1
+    };
+    if (!rd_try_subblock_new(cpi, td, tile_data, tp, &rdo_data, *best_rdc,
+                             &sum_rdc, multi_pass_mode)) {
+      av1_invalid_rd_stats(&sum_rdc);
+      break;
+    }
+  }
+
+  av1_rd_cost_update(x->rdmult, &sum_rdc);
+  if (sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = sum_rdc;
+    search_state->found_best_partition = true;
+    pc_tree->partitioning = PARTITION_HORZ_3;
+  }
+
+  av1_restore_context(cm, x, x_ctx, mi_row, mi_col, bsize, num_planes);
+}
+
+static INLINE void search_partition_vert_3(
+    PartitionSearchState *search_state, AV1_COMP *const cpi, ThreadData *td,
+    TileDataEnc *tile_data, TokenExtra **tp, RD_STATS *best_rdc,
+    PC_TREE *pc_tree,
+#if CONFIG_SDP
+    const PARTITION_TREE *ptree_luma, const PARTITION_TREE *template_tree,
+#endif  // CONFIG_SDP
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, SB_MULTI_PASS_MODE multi_pass_mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const int num_planes = av1_num_planes(cm);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+  const PartitionBlkParams *blk_params = &search_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  const BLOCK_SIZE sml_subsize = get_partition_subsize(bsize, PARTITION_VERT_3);  // 1/4-width cols
+  const BLOCK_SIZE big_subsize = get_partition_subsize(bsize, PARTITION_VERT);  // 1/2-width col
+
+  if (search_state->terminate_partition_search || !blk_params->has_cols ||
+      !is_partition_valid(bsize, PARTITION_VERT_3) ||
+      !(search_state->do_rectangular_split ||
+        av1_active_v_edge(cpi, mi_col, blk_params->mi_step_w))) {
+    return;
+  }
+
+  const int part_v3_rate = search_state->partition_cost[PARTITION_VERT_3];
+  if (part_v3_rate == INT_MAX ||
+      RDCOST(x->rdmult, part_v3_rate, 0) >= best_rdc->rdcost) {
+    return;
+  }
+
+  RD_STATS sum_rdc;
+  av1_init_rd_stats(&sum_rdc);
+  const int quarter_step = mi_size_wide[bsize] / 4;
+
+  sum_rdc.rate = search_state->partition_cost[PARTITION_VERT_3];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+  const int step_multipliers[3] = { 0, 1, 2 };
+  const BLOCK_SIZE subblock_sizes[3] = { sml_subsize, big_subsize,
+                                         sml_subsize };
+
+  for (int idx = 0; idx < 3; idx++) {  // Drop stale nodes from a previous pass.
+    if (pc_tree->vertical3[idx]) {
+      av1_free_pc_tree_recursive(pc_tree->vertical3[idx], num_planes, 0, 0);
+      pc_tree->vertical3[idx] = NULL;
+    }
+  }
+  pc_tree->vertical3[0] =
+      av1_alloc_pc_tree_node(mi_row, mi_col, subblock_sizes[0], pc_tree,
+                             PARTITION_VERT_3, 0, 0, ss_x, ss_y);
+  pc_tree->vertical3[1] =
+      av1_alloc_pc_tree_node(mi_row, mi_col + quarter_step, subblock_sizes[1],
+                             pc_tree, PARTITION_VERT_3, 1, 0, ss_x, ss_y);
+  pc_tree->vertical3[2] = av1_alloc_pc_tree_node(
+      mi_row, mi_col + quarter_step * 3, subblock_sizes[2], pc_tree,
+      PARTITION_VERT_3, 2, 1, ss_x, ss_y);
+
+  if (cpi->sf.part_sf.enable_fast_erp && !frame_is_intra_only(cm) &&
+      !x->must_find_valid_partition && is_bsize_pruning_cand(bsize)) {
+    const SimpleMotionData *left = av1_get_sms_data(
+        cpi, &tile_data->tile_info, x, mi_row, mi_col, subblock_sizes[0]);
+    const SimpleMotionData *middle =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row,
+                         mi_col + quarter_step, subblock_sizes[1]);
+    const SimpleMotionData *right =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row,
+                         mi_col + 3 * quarter_step, subblock_sizes[2]);
+
+    SMSPartitionStats part_data;
+    part_data.sms_data[0] = left;
+    part_data.sms_data[1] = middle;
+    part_data.sms_data[2] = right;
+    part_data.num_sub_parts = 3;
+    part_data.part_rate = part_v3_rate;
+
+    if (best_rdc->rdcost < INT64_MAX &&
+        (blk_params->mi_row + 2 * blk_params->mi_step_h <=
+         cm->mi_params.mi_rows) &&
+        (blk_params->mi_col + 2 * blk_params->mi_step_w <=
+         cm->mi_params.mi_cols) &&
+        av1_prune_new_part(&search_state->none_data, &part_data, x->rdmult,
+                           blk_params->bsize, &cpi->sf)) {
+      const BLOCK_SIZE midsize = subblock_sizes[1];
+      const BLOCK_SIZE subsubsize =
+          get_partition_subsize(midsize, PARTITION_HORZ);  // halves of middle
+      if (subsubsize == BLOCK_INVALID) {
+        return;
+      }
+
+      // Do one more check to deal with recursion
+      SMSPartitionStats subpart_data;
+      const SimpleMotionData *leftmid =  // top half of middle col (at mi_row)
+          av1_get_sms_data(cpi, &tile_data->tile_info, x, mi_row,
+                           mi_col + quarter_step, subsubsize);
+      const SimpleMotionData *rightmid = av1_get_sms_data(
+          cpi, &tile_data->tile_info, x, mi_row + 2 * quarter_step,  // 2*q == h/2 assumes square parent — TODO confirm
+          mi_col + quarter_step, subsubsize);
+      subpart_data.sms_data[0] = left;
+      subpart_data.sms_data[1] = leftmid;
+      subpart_data.sms_data[2] = rightmid;
+      subpart_data.sms_data[3] = right;
+      subpart_data.num_sub_parts = 4;
+      subpart_data.part_rate = 0;
+      if (av1_prune_new_part(&search_state->none_data, &subpart_data, x->rdmult,
+                             bsize, &cpi->sf)) {
+        return;
+      }
+    }
+  }
+
+  int this_mi_col = mi_col;
+  for (int i = 0; i < 3; ++i) {
+    this_mi_col += quarter_step * step_multipliers[i];
+
+    if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;
+
+    SUBBLOCK_RDO_DATA rdo_data = {
+      NULL,
+      pc_tree->vertical3[i],
+#if CONFIG_SDP
+      get_partition_subtree_const(ptree_luma, i),
+      get_partition_subtree_const(template_tree, i),
+#endif  // CONFIG_SDP
+      NULL,
+      mi_row,
+      this_mi_col,
+      subblock_sizes[i],
+      PARTITION_VERT_3,
+      i == 2,
+      1
+    };
+    if (!rd_try_subblock_new(cpi, td, tile_data, tp, &rdo_data, *best_rdc,
+                             &sum_rdc, multi_pass_mode)) {
+      av1_invalid_rd_stats(&sum_rdc);
+      break;
+    }
+  }
+
+  av1_rd_cost_update(x->rdmult, &sum_rdc);
+  if (sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = sum_rdc;
+    search_state->found_best_partition = true;
+    pc_tree->partitioning = PARTITION_VERT_3;
+  }
+  av1_restore_context(cm, x, x_ctx, mi_row, mi_col, bsize, num_planes);
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in]    cpi                Top-level encoder structure
+* \param[in]    td                 Pointer to thread data
+* \param[in]    tile_data          Pointer to struct holding adaptive
+*                                  data/contexts/models for the tile
+*                                  during encoding
+* \param[in]    tp                 Pointer to the starting token
+* \param[in]    mi_row             Row coordinate of the block in a step
+*                                  size of MI_SIZE
+* \param[in]    mi_col             Column coordinate of the block in a step
+*                                  size of MI_SIZE
+* \param[in]    bsize              Current block size
+* \param[in]    rd_cost            Pointer to the final rd cost of the block
+* \param[in]    best_rdc           Upper bound of rd cost of a valid partition
+* \param[in]    pc_tree            Pointer to the PC_TREE node storing the
+*                                  picked partitions and mode info for the
+*                                  current block
+* \param[in]    ptree_luma         Pointer to the luma partition tree, used
+*                                  to estimate the chroma partition type
+* \param[in]    template_tree      A partial tree that contains the
+*                                  partition structure to be used as a
+*                                  template
+* \param[in]    sms_tree           Pointer to struct holding simple motion
+*                                  search data for the current block
+* \param[in]    none_rd            Pointer to the rd cost in the case of not
+*                                  splitting the current block
+* \param[in]    multi_pass_mode    SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in]    rect_part_win_info Pointer to struct storing whether horz/
+*                                  vert partition outperforms previously
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+#else
 /*!\brief AV1 block partition search (full search).
 *
 * \ingroup partition_search
@@ -3266,10 +4635,15 @@
 * The rd_cost struct is also updated with the RD stats corresponding to the
 * best partition found.
 */
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
 bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                            TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
                            int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
                            RD_STATS best_rdc, PC_TREE *pc_tree,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                           const PARTITION_TREE *ptree_luma,
+                           const PARTITION_TREE *template_tree,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
                            SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
                            SB_MULTI_PASS_MODE multi_pass_mode,
                            RD_RECT_PART_WIN_INFO *rect_part_win_info) {
@@ -3282,15 +4656,57 @@
   const TokenExtra *const tp_orig = *tp;
   PartitionSearchState part_search_state;
   // Initialization of state variables used in partition search.
-  init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
-                                     bsize);
+  init_partition_search_state_params(x, cpi, &part_search_state,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                                     pc_tree,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                                     mi_row, mi_col, bsize);
   PartitionBlkParams blk_params = part_search_state.part_blk_params;
-
-  sms_tree->partitioning = PARTITION_NONE;
+#if CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_SDP
+  const PARTITION_TYPE forced_partition =
+      get_forced_partition_type(cm, x, mi_row, mi_col, bsize, template_tree);
+#else
+  const PARTITION_TYPE forced_partition =
+      get_forced_partition_type(cm, x, mi_row, mi_col, bsize);
+#endif  // CONFIG_SDP
+  if (sms_tree != NULL)
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    sms_tree->partitioning = PARTITION_NONE;
   if (best_rdc.rdcost < 0) {
     av1_invalid_rd_stats(rd_cost);
     return part_search_state.found_best_partition;
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // Check whether there is a counterpart pc_tree node with the same size
+  // and the same neighboring context at the same location but from a
+  // different partition path. If yes directly copy the RDO decision made for
+  // the counterpart.
+  PC_TREE *counterpart_block = av1_look_for_counterpart_block(pc_tree);
+  if (counterpart_block) {
+    if (counterpart_block->rd_cost.rate != INT_MAX) {
+      av1_copy_pc_tree_recursive(cm, pc_tree, counterpart_block,
+                                 part_search_state.ss_x, part_search_state.ss_y,
+                                 &td->shared_coeff_buf, num_planes);
+      *rd_cost = pc_tree->rd_cost;
+      assert(bsize != cm->seq_params.sb_size);
+      if (bsize == cm->seq_params.sb_size) exit(0);
+
+      if (!pc_tree->is_last_subblock) {
+        encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                  pc_tree, NULL,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                  NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                  NULL);
+      }
+      return true;
+    } else {
+      av1_invalid_rd_stats(rd_cost);
+      return false;
+    }
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
 
   // Override skipping rectangular partition operations for edge blocks.
@@ -3334,10 +4750,14 @@
          sizeof(x->txfm_search_info.blk_skip));
 #endif  // NDEBUG
 
+  assert(bsize < BLOCK_SIZES_ALL);
+#if !CONFIG_EXT_RECUR_PARTITIONS
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Set buffers and offsets.
-  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize,
+                  &pc_tree->chroma_ref_info);
 
   // Save rdmult before it might be changed, so it can be restored later.
   const int orig_rdmult = x->rdmult;
@@ -3358,6 +4778,7 @@
 
   int *partition_horz_allowed = &part_search_state.partition_rect_allowed[HORZ];
   int *partition_vert_allowed = &part_search_state.partition_rect_allowed[VERT];
+#if !CONFIG_EXT_RECUR_PARTITIONS
   int *prune_horz = &part_search_state.prune_rect_part[HORZ];
   int *prune_vert = &part_search_state.prune_rect_part[VERT];
   // Pruning: before searching any partition type, using source and simple
@@ -3367,17 +4788,45 @@
       &part_search_state.partition_none_allowed, partition_horz_allowed,
       partition_vert_allowed, &part_search_state.do_rectangular_split,
       &part_search_state.do_square_split, prune_horz, prune_vert);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
-  // Pruning: eliminating partition types leading to coding block sizes outside
-  // the min and max bsize limitations set from the encoder.
+  // Pruning: eliminating partition types leading to coding block sizes
+  // outside the min and max bsize limitations set from the encoder.
   av1_prune_partitions_by_max_min_bsize(
       &x->sb_enc, bsize, blk_params.has_rows && blk_params.has_cols,
       &part_search_state.partition_none_allowed, partition_horz_allowed,
+#if CONFIG_EXT_RECUR_PARTITIONS
+      partition_vert_allowed, NULL);
+#else
       partition_vert_allowed, &part_search_state.do_square_split);
+#endif
 
 #if CONFIG_SDP
   int luma_split_flag = 0;
-  int parent_block_width = block_size_wide[bsize];
+  const int parent_block_width = block_size_wide[bsize];
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int min_bsize_1d = AOMMIN(block_size_high[bsize], parent_block_width);
+  int horz_3_allowed_sdp = 1;
+  int vert_3_allowed_sdp = 1;
+  if (xd->tree_type == CHROMA_PART && min_bsize_1d >= SHARED_PART_SIZE &&
+      ptree_luma) {
+    PARTITION_TYPE derived_partition_mode = sdp_chroma_part_from_luma(
+        bsize, ptree_luma->partition, part_search_state.ss_x,
+        part_search_state.ss_y);
+
+    if (derived_partition_mode != PARTITION_NONE)
+      part_search_state.partition_none_allowed = BLOCK_INVALID;
+    if (derived_partition_mode != PARTITION_HORZ)
+      part_search_state.partition_rect_allowed[HORZ] = 0;
+    if (derived_partition_mode != PARTITION_VERT)
+      part_search_state.partition_rect_allowed[VERT] = 0;
+    if (derived_partition_mode != PARTITION_HORZ_3) horz_3_allowed_sdp = 0;
+    if (derived_partition_mode != PARTITION_VERT_3) vert_3_allowed_sdp = 0;
+
+    // TODO(yuec): Need to make sure there is at least one valid partition
+    // mode
+  }
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   if (xd->tree_type == CHROMA_PART && parent_block_width >= SHARED_PART_SIZE) {
     luma_split_flag = get_luma_split_flag(bsize, mi_params, mi_row, mi_col);
@@ -3388,25 +4837,50 @@
     part_search_state.partition_rect_allowed[HORZ] = 0;
     part_search_state.partition_rect_allowed[VERT] = 0;
   }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 #endif
 
   // Partition search
 BEGIN_PARTITION_SEARCH:
-  // If a valid partition is required, usually when the first round cannot find
-  // a valid one under the cost limit after pruning, reset the limitations on
-  // partition types.
+  // If a valid partition is required, usually when the first round cannot
+  // find a valid one under the cost limit after pruning, reset the
+  // limitations on partition types.
   if (x->must_find_valid_partition)
-    reset_part_limitations(cpi, &part_search_state);
+    reset_part_limitations(cpi,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                           xd->tree_type,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+#if CONFIG_EXT_RECUR_PARTITIONS
+                           pc_tree,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                           &part_search_state);
 
   // Partition block source pixel variance.
   unsigned int pb_source_variance = UINT_MAX;
 
   // PARTITION_NONE search stage.
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (cpi->sf.part_sf.enable_fast_erp && !frame_is_intra_only(cm)) {
+    const SimpleMotionData *whole =
+        av1_get_sms_data(cpi, tile_info, x, mi_row, mi_col, bsize);
+    part_search_state.none_data.sms_data[0] = whole;
+    part_search_state.none_data.num_sub_parts = 1;
+    part_search_state.none_data.part_rate =
+        part_search_state.partition_cost[PARTITION_NONE];
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   int64_t part_none_rd = INT64_MAX;
-  none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
-                        &part_search_state, &best_rdc, &pb_source_variance,
-                        none_rd, &part_none_rd);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (IS_FORCED_PARTITION_TYPE(PARTITION_NONE)) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+                          &part_search_state, &best_rdc, &pb_source_variance,
+                          none_rd, &part_none_rd);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
   // PARTITION_SPLIT search stage.
   int64_t part_split_rd = INT64_MAX;
   split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
@@ -3424,14 +4898,21 @@
   // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
   prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
                                part_none_rd, part_split_rd);
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 
   // Rectangular partitions search stage.
   rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
                                &part_search_state, &best_rdc,
+#if CONFIG_EXT_RECUR_PARTITIONS
+                               multi_pass_mode,
+#if CONFIG_SDP
+                               ptree_luma, template_tree,
+#endif  // CONFIG_SDP
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
                                rect_part_win_info);
 
   if (pb_source_variance == UINT_MAX) {
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, NULL);
     if (is_cur_buf_hbd(xd)) {
       pb_source_variance = av1_high_get_sby_perpixel_variance(
           cpi, &x->plane[0].src, bsize, xd->bd);
@@ -3444,6 +4925,7 @@
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
                  !part_search_state.do_rectangular_split));
 
+#if !CONFIG_EXT_RECUR_PARTITIONS
 #if CONFIG_SDP
   const int ext_partition_allowed =
       part_search_state.do_rectangular_split &&
@@ -3454,7 +4936,7 @@
       part_search_state.do_rectangular_split &&
       bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
       blk_params.has_rows && blk_params.has_cols;
-#endif
+#endif  // CONFIG_SDP
 
   // AB partitions search stage.
   ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
@@ -3464,18 +4946,13 @@
   // 4-way partitions search stage.
   int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
 
-  // Disable 4-way partition search flags for width less than twice the minimum
-  // width.
-#if CONFIG_SDP
+  // Disable 4-way partition search flags for width less than twice the
+  // minimum width.
 #if CONFIG_SDP
   if (blk_params.width < (blk_params.min_partition_size_1d << 2) ||
       (xd->tree_type == CHROMA_PART && bsize <= BLOCK_16X16) ||
       (luma_split_flag > 3)) {
 #else
-  if (blk_params.width < (blk_params.min_partition_size_1d << 2) ||
-      (xd->tree_type == CHROMA_PART && bsize <= BLOCK_16X16)) {
-#endif
-#else
   if (blk_params.width < (blk_params.min_partition_size_1d << 2)) {
 #endif
     part4_search_allowed[HORZ4] = 0;
@@ -3516,6 +4993,69 @@
                        pc_tree->vertical4, &part_search_state, &best_rdc,
                        inc_step, PARTITION_VERT_4);
   }
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int ext_partition_allowed =
+      (blk_params.has_rows && blk_params.has_cols) || !is_square_block(bsize);
+  const int partition_3_allowed =
+      ext_partition_allowed && bsize != BLOCK_128X128;
+  const int is_wide_block = block_size_wide[bsize] > block_size_high[bsize];
+  const int is_tall_block = block_size_wide[bsize] < block_size_high[bsize];
+  const int horz_3_allowed =
+      partition_3_allowed && (is_square_block(bsize) || is_tall_block) &&
+#if CONFIG_SDP
+      horz_3_allowed_sdp &&
+#endif  // CONFIG_SDP
+      check_is_chroma_size_valid(PARTITION_HORZ_3, bsize, mi_row, mi_col,
+                                 part_search_state.ss_x, part_search_state.ss_y,
+                                 pc_tree) &&
+      is_bsize_geq(get_partition_subsize(bsize, PARTITION_HORZ_3),
+                   blk_params.min_partition_size) &&
+      IMPLIES(cpi->sf.part_sf.prune_part_3_with_part_none,
+              frame_is_intra_only(cm) || forced_partition == PARTITION_HORZ_3 ||
+                  pc_tree->partitioning != PARTITION_NONE);
+
+  const int vert_3_allowed =
+      partition_3_allowed && (is_square_block(bsize) || is_wide_block) &&
+#if CONFIG_SDP
+      vert_3_allowed_sdp &&
+#endif  // CONFIG_SDP
+      check_is_chroma_size_valid(PARTITION_VERT_3, bsize, mi_row, mi_col,
+                                 part_search_state.ss_x, part_search_state.ss_y,
+                                 pc_tree) &&
+      is_bsize_geq(get_partition_subsize(bsize, PARTITION_VERT_3),
+                   blk_params.min_partition_size) &&
+      IMPLIES(cpi->sf.part_sf.prune_part_3_with_part_none,
+              frame_is_intra_only(cm) || forced_partition == PARTITION_VERT_3 ||
+                  pc_tree->partitioning != PARTITION_NONE);
+
+  // PARTITION_HORZ_3
+  if (IS_FORCED_PARTITION_TYPE(PARTITION_HORZ_3) && horz_3_allowed) {
+    search_partition_horz_3(
+        &part_search_state, cpi, td, tile_data, tp, &best_rdc, pc_tree,
+#if CONFIG_SDP
+        (ptree_luma && ptree_luma->partition == PARTITION_HORZ_3) ? ptree_luma
+                                                                  : NULL,
+
+        template_tree,
+#endif  // CONFIG_SDP
+        &x_ctx, multi_pass_mode);
+  }
+
+  // PARTITION_VERT_3
+  if (IS_FORCED_PARTITION_TYPE(PARTITION_VERT_3) && vert_3_allowed) {
+    search_partition_vert_3(
+        &part_search_state, cpi, td, tile_data, tp, &best_rdc, pc_tree,
+#if CONFIG_SDP
+        (ptree_luma && ptree_luma->partition == PARTITION_VERT_3) ? ptree_luma
+                                                                  : NULL,
+
+        template_tree,
+#endif  // CONFIG_SDP
+        &x_ctx, multi_pass_mode);
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   if (bsize == cm->seq_params.sb_size &&
       !part_search_state.found_best_partition) {
@@ -3530,10 +5070,22 @@
 
   // Store the final rd cost
   *rd_cost = best_rdc;
+  pc_tree->rd_cost = best_rdc;
+  if (!part_search_state.found_best_partition) {
+    av1_invalid_rd_stats(&pc_tree->rd_cost);
+  } else {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    av1_cache_best_partition(x->sms_bufs, mi_row, mi_col, bsize,
+                             cm->seq_params.sb_size, pc_tree->partitioning);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+  }
 
   // Also record the best partition in simple motion data tree because it is
   // necessary for the related speed features.
-  sms_tree->partitioning = pc_tree->partitioning;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  if (sms_tree)
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    sms_tree->partitioning = pc_tree->partitioning;
 
 #if CONFIG_SDP
   if (luma_split_flag > 3) {
@@ -3566,8 +5118,9 @@
 #endif
 
 #if CONFIG_COLLECT_PARTITION_STATS == 2
-  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
-  // the whole clip. So we need to pass the information upstream to the encoder.
+  // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats
+  // for the whole clip. So we need to pass the information upstream to the
+  // encoder.
   const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
   int *agg_attempts = part_stats->partition_attempts[bsize_idx];
   int *agg_decisions = part_stats->partition_decisions[bsize_idx];
@@ -3585,31 +5138,57 @@
   // If a valid partition is found and reconstruction is required for future
   // sub-blocks in the same group.
   if (part_search_state.found_best_partition && pc_tree->index != 3) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    assert(pc_tree->partitioning != PARTITION_SPLIT);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     if (bsize == cm->seq_params.sb_size) {
       // Encode the superblock.
       const int emit_output = multi_pass_mode != SB_DRY_PASS;
       const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
 #if CONFIG_SDP
-      x->cb_offset[xd->tree_type == CHROMA_PART] = 0;
-#else
-      x->cb_offset = 0;
-#endif
+      const int plane_start = (xd->tree_type == CHROMA_PART);
+      const int plane_end = (xd->tree_type == LUMA_PART) ? 1 : num_planes;
+      for (int plane = plane_start; plane < plane_end; plane++) {
+        x->cb_offset[plane] = 0;
+      }
+      av1_reset_ptree_in_sbi(xd->sbi, xd->tree_type);
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
-                pc_tree, NULL);
-      // Dealloc the whole PC_TREE after a superblock is done.
+                pc_tree, xd->sbi->ptree_root[av1_get_sdp_idx(xd->tree_type)],
+#if CONFIG_EXT_RECUR_PARTITIONS
+                xd->tree_type == CHROMA_PART ? xd->sbi->ptree_root[0] : NULL,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+                NULL);
+#else
+      memset(x->cb_offset, 0, sizeof(x->cb_offset));
+      av1_reset_ptree_in_sbi(xd->sbi);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+                pc_tree, xd->sbi->ptree_root, NULL);
+#endif  // CONFIG_SDP
+        // Dealloc the whole PC_TREE after a superblock is done.
       av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0);
       pc_tree_dealloc = 1;
     } else {
       // Encode the smaller blocks in DRY_RUN mode.
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
+                pc_tree, NULL,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                NULL,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                NULL);
     }
   }
 
+  int keep_tree = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  keep_tree = should_reuse_mode(x, REUSE_INTER_MODE_IN_INTERFRAME_FLAG |
+                                       REUSE_INTRA_MODE_IN_INTERFRAME_FLAG);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   // If the tree still exists (non-superblock), dealloc most nodes, only keep
   // nodes for the best partition and PARTITION_NONE.
-  if (pc_tree_dealloc == 0)
+  if (!pc_tree_dealloc && !keep_tree) {
     av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1);
+  }
 
   if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
index 9d1aa2d..8f5c342a 100644
--- a/av1/encoder/partition_search.h
+++ b/av1/encoder/partition_search.h
@@ -15,25 +15,41 @@
 #include "av1/encoder/block.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/tokenize.h"
 
 void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
                                         const TileInfo *const tile,
                                         MACROBLOCK *const x, int mi_row,
-                                        int mi_col, BLOCK_SIZE bsize);
+                                        int mi_col, BLOCK_SIZE bsize,
+                                        const CHROMA_REF_INFO *chr_ref_info);
 void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
                      MACROBLOCK *const x, int mi_row, int mi_col,
-                     BLOCK_SIZE bsize);
+                     BLOCK_SIZE bsize, const CHROMA_REF_INFO *chr_ref_info);
 void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
                           MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
                           int mi_col, BLOCK_SIZE bsize, int *rate,
-                          int64_t *dist, int do_recon, PC_TREE *pc_tree);
+                          int64_t *dist, int do_recon, PARTITION_TREE *ptree,
+                          PC_TREE *pc_tree);
 bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                            TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
                            int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
                            RD_STATS best_rdc, PC_TREE *pc_tree,
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+                           const PARTITION_TREE *ptree_luma,
+                           const PARTITION_TREE *template_tree,
+#endif  // CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
                            SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
                            SB_MULTI_PASS_MODE multi_pass_mode,
                            RD_RECT_PART_WIN_INFO *rect_part_win_info);
+#if CONFIG_EXT_RECUR_PARTITIONS
+void av1_build_partition_tree_fixed_partitioning(AV1_COMMON *const cm,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize,
+                                                 PARTITION_TREE *ptree);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                        int mi_row, int mi_col, BLOCK_SIZE bsize,
+                        AQ_MODE aq_mode, MB_MODE_INFO *mbmi);
 
 #endif  // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 7499e87..bd36b38 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -24,8 +24,12 @@
 #include "av1/encoder/encoder.h"
 
 #include "av1/encoder/motion_search_facade.h"
-#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
 #include "av1/encoder/rdopt.h"
+#if CONFIG_EXT_RECUR_PARTITIONS
+#include "av1/common/idct.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
 static AOM_INLINE void simple_motion_search_prune_part_features(
     AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
@@ -834,9 +838,11 @@
       get_min_bsize(sms_tree->split[i], min_bw, min_bh);
     }
   } else {
+#if !CONFIG_EXT_RECUR_PARTITIONS
     if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
         part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B)
       part_type = PARTITION_SPLIT;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
     const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type);
     if (subsize != BLOCK_INVALID) {
       *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]);
@@ -1192,10 +1198,15 @@
   unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_3);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_3);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
-                         av1_num_planes(&cpi->common), bsize);
+                         av1_num_planes(&cpi->common), NULL);
     const int src_stride = x->plane[0].src.stride;
     uint8_t *src = x->plane[0].src.buf;
     const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1410,12 +1421,6 @@
   }
 }
 
-#ifndef NDEBUG
-static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
-  return block_size_wide[bsize] == block_size_high[bsize];
-}
-#endif  // NDEBUG
-
 void av1_prune_partitions_by_max_min_bsize(
     SuperBlockEnc *sb_enc, BLOCK_SIZE bsize, int is_not_edge_block,
     int *partition_none_allowed, int *partition_horz_allowed,
@@ -1423,19 +1428,38 @@
   assert(is_bsize_square(sb_enc->max_partition_size));
   assert(is_bsize_square(sb_enc->min_partition_size));
   assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+#if !CONFIG_EXT_RECUR_PARTITIONS
   assert(is_bsize_square(bsize));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
   const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
   const int bsize_1d = block_size_wide[bsize];
   assert(min_partition_size_1d <= max_partition_size_1d);
   const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int block_height = block_size_high[bsize];
+  const int block_width = block_size_wide[bsize];
+  const int is_gt_max_sq_part = (block_height > max_partition_size_1d) ||
+                                (block_width > max_partition_size_1d);
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+  (void)do_square_split;
+  (void)is_not_edge_block;
+#endif
   if (is_gt_max_sq_part) {
     // If current block size is larger than max, only allow split.
     *partition_none_allowed = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    *partition_horz_allowed = 1;
+    *partition_vert_allowed = 1;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     *partition_horz_allowed = 0;
     *partition_vert_allowed = 0;
     *do_square_split = 1;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   } else if (is_le_min_sq_part) {
     // If current block size is less or equal to min, only allow none if valid
     // block large enough; only allow split otherwise.
@@ -1443,8 +1467,12 @@
     *partition_vert_allowed = 0;
     // only disable square split when current block is not at the picture
     // boundary. otherwise, inherit the square split flag from previous logic
+#if CONFIG_EXT_RECUR_PARTITIONS
+    *partition_none_allowed = 1;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     if (is_not_edge_block) *do_square_split = 0;
     *partition_none_allowed = !(*do_square_split);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   }
 }
 
@@ -1626,3 +1654,378 @@
         pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
   }
 }
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+// Gets the number of sms data in a single dimension
+static INLINE int get_sms_count_from_length(int mi_length) {
+  switch (mi_length) {
+    case 32: return BLOCK_128_COUNT;
+    case 16: return BLOCK_64_COUNT;
+    case 8: return BLOCK_32_COUNT;
+    case 4: return BLOCK_16_COUNT;
+    case 2: return BLOCK_8_COUNT;
+    case 1: return BLOCK_4_COUNT;
+    default: assert(0 && "Invalid mi_width"); return -1;
+  }
+}
+
+// Gets the linear index corresponds to the current block.
+static INLINE int get_sms_arr_1d_idx(int mi_bsize, int mi_in_sb) {
+  int idx = -1;
+  if (mi_bsize == 1) {
+    idx = mi_in_sb;
+  } else {
+    assert(mi_in_sb % (mi_bsize / 2) == 0);
+    idx = mi_in_sb / (mi_bsize / 2);
+  }
+  assert(idx >= 0 && idx < get_sms_count_from_length(mi_bsize));
+
+  return idx;
+}
+
+#define MAKE_SMS_ARR_SWITCH_CASE(width, height) \
+  case BLOCK_##width##X##height: {              \
+    return sms_bufs->b_##width##x##height;      \
+  }
+
+// Returns the buffer in SimpleMotionDataBufs that correspond to bsize.
+static INLINE SimpleMotionData *get_sms_arr(SimpleMotionDataBufs *sms_bufs,
+                                            BLOCK_SIZE bsize) {
+  switch (bsize) {
+    // Square blocks
+    MAKE_SMS_ARR_SWITCH_CASE(128, 128);
+    MAKE_SMS_ARR_SWITCH_CASE(64, 64);
+    MAKE_SMS_ARR_SWITCH_CASE(32, 32);
+    MAKE_SMS_ARR_SWITCH_CASE(16, 16);
+    MAKE_SMS_ARR_SWITCH_CASE(8, 8);
+    MAKE_SMS_ARR_SWITCH_CASE(4, 4);
+
+    // 1:2 blocks
+    MAKE_SMS_ARR_SWITCH_CASE(64, 128);
+    MAKE_SMS_ARR_SWITCH_CASE(32, 64);
+    MAKE_SMS_ARR_SWITCH_CASE(16, 32);
+    MAKE_SMS_ARR_SWITCH_CASE(8, 16);
+    MAKE_SMS_ARR_SWITCH_CASE(4, 8);
+
+    // 2:1 blocks
+    MAKE_SMS_ARR_SWITCH_CASE(128, 64);
+    MAKE_SMS_ARR_SWITCH_CASE(64, 32);
+    MAKE_SMS_ARR_SWITCH_CASE(32, 16);
+    MAKE_SMS_ARR_SWITCH_CASE(16, 8);
+    MAKE_SMS_ARR_SWITCH_CASE(8, 4);
+
+    // 1:4 blocks
+    MAKE_SMS_ARR_SWITCH_CASE(16, 64);
+    MAKE_SMS_ARR_SWITCH_CASE(8, 32);
+    MAKE_SMS_ARR_SWITCH_CASE(4, 16);
+
+    // 4:1 blocks
+    MAKE_SMS_ARR_SWITCH_CASE(64, 16);
+    MAKE_SMS_ARR_SWITCH_CASE(32, 8);
+    MAKE_SMS_ARR_SWITCH_CASE(16, 4);
+
+    default: assert(0 && "Invalid bsize"); return NULL;
+  }
+}
+#undef MAKE_SMS_ARR_SWITCH_CASE
+
+// Retrieves the SimpleMotionData from SimpleMotionDataBufs
+SimpleMotionData *av1_get_sms_data_entry(SimpleMotionDataBufs *sms_bufs,
+                                         int mi_row, int mi_col,
+                                         BLOCK_SIZE bsize, BLOCK_SIZE sb_size) {
+  assert(mi_size_high[sb_size] == mi_size_wide[sb_size]);
+  const int mi_in_sb = mi_size_high[sb_size];
+  const int mi_row_in_sb = mi_row % mi_in_sb;
+  const int mi_col_in_sb = mi_col % mi_in_sb;
+  const int mi_high = mi_size_high[bsize];
+  const int mi_wide = mi_size_wide[bsize];
+  const int idx_row_in_sb = get_sms_arr_1d_idx(mi_high, mi_row_in_sb);
+  const int idx_col_in_sb = get_sms_arr_1d_idx(mi_wide, mi_col_in_sb);
+  const int arr_stride = get_sms_count_from_length(mi_wide);
+  SimpleMotionData *sms_arr = get_sms_arr(sms_bufs, bsize);
+  return &sms_arr[idx_row_in_sb * arr_stride + idx_col_in_sb];
+}
+
+void av1_cache_best_partition(SimpleMotionDataBufs *sms_bufs, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, BLOCK_SIZE sb_size,
+                              PARTITION_TYPE partition) {
+  SimpleMotionData *cur_block =
+      av1_get_sms_data_entry(sms_bufs, mi_row, mi_col, bsize, sb_size);
+  cur_block->has_prev_partition = 1;
+  cur_block->prev_partition = partition;
+}
+
+// Performs a simple motion search and store the result in sms_data.
+static void compute_sms_data(AV1_COMP *const cpi, const TileInfo *const tile,
+                             MACROBLOCK *x, SimpleMotionData *sms_data,
+                             int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int ref_frame =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+    // If the whole block is outside of the image, set the var and sse to 0.
+    sms_data->sse = 0;
+    sms_data->var = 0;
+    sms_data->dist = 0;
+    sms_data->rate = 0;
+    sms_data->rdcost = 0;
+    sms_data->valid = 1;
+    return;
+  }
+  av1_set_offsets(cpi, tile, x, mi_row, mi_col, bsize, NULL);
+  // We need to update the rd-mult here to in case we are doing simple motion
+  // search on a subblock of the current coding block.
+  const int orig_rdmult = x->rdmult;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->mv_costs, x->rdmult);
+  if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const uint8_t *src_buf = x->plane[0].src.buf;
+    const uint8_t *dst_buf = xd->plane[0].dst.buf;
+    const int src_stride = x->plane[0].src.stride;
+    const int dst_stride = xd->plane[0].dst.stride;
+    if (sms_data->num_start_mvs == 0) {
+      sms_data->start_mv_list[sms_data->num_start_mvs++] = kZeroMv;
+    }
+    sms_data->rdcost = INT64_MAX;
+    SimpleMotionData best_data = *sms_data;
+    for (int idx = 0; idx < sms_data->num_start_mvs; idx++) {
+      const MV start_mv = sms_data->start_mv_list[idx];
+      const FULLPEL_MV start_mv_full = get_fullmv_from_mv(&start_mv);
+      av1_simple_motion_search_ext(cpi, tile, x, mi_row, mi_col, bsize,
+                                   ref_frame, start_mv_full, 1, 1, sms_data);
+      sms_data->var = cpi->fn_ptr[bsize].vf(src_buf, src_stride, dst_buf,
+                                            dst_stride, &sms_data->sse);
+      sms_data->dist = 16 * sms_data->sse;
+      sms_data->rate = 0;
+      sms_data->rdcost = RDCOST(x->rdmult, sms_data->rate, sms_data->dist);
+      if (sms_data->rdcost <= best_data.rdcost) {
+        best_data = *sms_data;
+      }
+    }
+    *sms_data = best_data;
+  }
+  sms_data->valid = 1;
+  sms_data->bsize = bsize;
+  sms_data->mi_row = mi_row;
+  sms_data->mi_col = mi_col;
+  x->rdmult = orig_rdmult;
+  return;
+}
+
+#if CONFIG_DEBUG
+// Debug helper: dumps the key fields of a SimpleMotionData entry to stdout.
+static INLINE void print_sms(const SimpleMotionData *sms_data,
+                             const char *prefix) {
+  BLOCK_SIZE bsize = sms_data->bsize;
+  MV fullmv = sms_data->fullmv;
+  MV submv = sms_data->submv;
+  // rdcost is 64-bit; "%ld" is only 32 bits wide on LLP64 platforms (e.g.
+  // 64-bit Windows), so cast to long long and use "%lld" for portability.
+  printf("%s:: bsize: (%d, %d), mi_row: %d, mi_col: %d, rd: %lld\n", prefix,
+         block_size_wide[bsize], block_size_high[bsize], sms_data->mi_row,
+         sms_data->mi_col, (long long)sms_data->rdcost);
+  printf("%s:: fullmv: (%d, %d), submv: (%d, %d),\n", prefix, fullmv.row,
+         fullmv.col, submv.row, submv.col);
+  printf("%s:: mv_cost_type: %d, sadpb: %d, errpb: %d\n", prefix,
+         sms_data->mv_cost_type, sms_data->sadpb, sms_data->errorperbit);
+}
+#endif
+
+// Appends start_mv to block's start-MV candidate list unless the list is
+// already full or the MV duplicates one already stored.
+static INLINE void add_start_mv_to_block(SimpleMotionData *block, MV start_mv) {
+  // Use >= rather than == so an out-of-range count can never grow further.
+  if (block->num_start_mvs >= kSMSMaxStartMVs) {
+    return;
+  }
+  for (int idx = 0; idx < block->num_start_mvs; idx++) {
+    const MV *cur_mv = &block->start_mv_list[idx];
+    // Compare the MV fields directly instead of type-punning through
+    // (int_mv *): casting MV * violates strict aliasing and relies on the
+    // union layout.
+    if (start_mv.row == cur_mv->row && start_mv.col == cur_mv->col) {
+      return;
+    }
+  }
+  block->start_mv_list[block->num_start_mvs++] = start_mv;
+}
+
+// Seeds the start-MV candidate lists of every subblock produced by applying
+// `partition` to the block at (mi_row, mi_col) of size `bsize` with start_mv.
+static INLINE void add_start_mv_to_partition(
+    SimpleMotionDataBufs *sms_bufs, int mi_row, int mi_col, BLOCK_SIZE bsize,
+    BLOCK_SIZE sb_size, PARTITION_TYPE partition, MV start_mv) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  // Subblock offsets below are expressed in quarter-block steps (pixels),
+  // later divided by 4 to land on mi units.
+  const int quarter_step_h = block_size_high[bsize] / 4;
+  const int quarter_step_w = block_size_wide[bsize] / 4;
+  // Number of subblocks each partition type produces.
+  static const int subblock_count[EXT_PARTITION_TYPES] = {
+    1,  // PARTITION_NONE
+    2,  // PARTITION_HORZ
+    2,  // PARTITION_VERT
+    3,  // PARTITION_HORZ_3
+    3,  // PARTITION_VERT_3
+  };
+  // PARTITION x NUM_SUBBLOCKS x (ROW and COL)
+  // Quarter-step multipliers giving each subblock's top-left offset.
+  static const int step_multiplier[EXT_PARTITION_TYPES][3][2] = {
+    { { 0, 0 }, { 0, 0 }, { 0, 0 } },  // PARTITION_NONE
+    { { 0, 0 }, { 2, 0 }, { 0, 0 } },  // PARTITION_HORZ
+    { { 0, 0 }, { 0, 2 }, { 0, 0 } },  // PARTITION_VERT
+    { { 0, 0 }, { 1, 0 }, { 3, 0 } },  // PARTITION_HORZ_3
+    { { 0, 0 }, { 0, 1 }, { 0, 3 } },  // PARTITION_VERT_3
+  };
+  for (int idx = 0; idx < subblock_count[partition]; idx++) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+    if (subsize == BLOCK_INVALID) {
+      return;
+    } else if (partition == PARTITION_HORZ_3 && idx == 1) {
+      // The middle subblock of a 3-way split uses the 2-way subsize.
+      subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    } else if (partition == PARTITION_VERT_3 && idx == 1) {
+      subsize = get_partition_subsize(bsize, PARTITION_VERT);
+    }
+    const int sub_row =
+        mi_row + step_multiplier[partition][idx][0] * quarter_step_h / 4;
+    const int sub_col =
+        mi_col + step_multiplier[partition][idx][1] * quarter_step_w / 4;
+    SimpleMotionData *subblock =
+        av1_get_sms_data_entry(sms_bufs, sub_row, sub_col, subsize, sb_size);
+    add_start_mv_to_block(subblock, start_mv);
+  }
+}
+
+// Computes and stores the simple motion search data for the block at mi_row,
+// mi_col with block size bsize.
+// Results are cached in x->sms_bufs: a valid entry is returned as-is;
+// otherwise it is computed once, and the resulting full-pel MV is seeded as a
+// start-MV candidate into the subblocks of every partition type.
+SimpleMotionData *av1_get_sms_data(AV1_COMP *const cpi,
+                                   const TileInfo *const tile, MACROBLOCK *x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  SimpleMotionDataBufs *sms_bufs = x->sms_bufs;
+  SimpleMotionData *cur_block =
+      av1_get_sms_data_entry(sms_bufs, mi_row, mi_col, bsize, sb_size);
+  const int valid = cur_block->valid;
+  if (!valid) {
+    // Lazily compute the entry on first access.
+    compute_sms_data(cpi, tile, x, cur_block, mi_row, mi_col, bsize);
+    // Propagate the full-pel result as a search hint to all subblocks.
+    for (PARTITION_TYPE partition = PARTITION_NONE;
+         partition < EXT_PARTITION_TYPES; partition++) {
+      add_start_mv_to_partition(sms_bufs, mi_row, mi_col, bsize, sb_size,
+                                partition, cur_block->fullmv);
+    }
+  }
+  return cur_block;
+}
+
+PARTITION_TYPE av1_get_prev_partition(MACROBLOCK *x, int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize, BLOCK_SIZE sb_size) {
+  // Looks up the cached SMS entry for this block and reports the partition
+  // decision recorded there, or PARTITION_INVALID when none has been stored.
+  const SimpleMotionData *entry =
+      av1_get_sms_data_entry(x->sms_bufs, mi_row, mi_col, bsize, sb_size);
+  return entry->has_prev_partition ? entry->prev_partition : PARTITION_INVALID;
+}
+
+// Accumulates the rate/dist of all subblocks recorded in `stat` into rd_stats
+// and computes the resulting rdcost with `rdmult`.
+static INLINE void gather_part_rd_stats(RD_STATS *rd_stats,
+                                        const SMSPartitionStats *stat,
+                                        int rdmult) {
+  av1_init_rd_stats(rd_stats);
+  if (stat->part_rate < INT_MAX) {
+    // NOTE(review): the partition-signaling rate is intentionally(?) not
+    // added here -- the addition below is commented out. Confirm this is the
+    // desired behavior.
+    // rd_stats->rate += part_rate;
+  } else {
+    // An unset part_rate (INT_MAX sentinel) marks the whole partition as
+    // invalid; report an infinite rd cost.
+    rd_stats->rate = INT_MAX;
+    rd_stats->rdcost = INT64_MAX;
+    return;
+  }
+  for (int idx = 0; idx < stat->num_sub_parts; idx++) {
+    rd_stats->rate += stat->sms_data[idx]->rate;
+    rd_stats->dist += stat->sms_data[idx]->dist;
+  }
+  rd_stats->rdcost = RDCOST(rdmult, rd_stats->rate, rd_stats->dist);
+}
+
+/*! \brief Checks if the average linear dimension of bsize is strictly greater
+ * than dim. Returns 0 for BLOCK_INVALID. */
+static INLINE int is_avg_dim_greater_than(BLOCK_SIZE bsize, int dim) {
+  if (bsize == BLOCK_INVALID) {
+    return 0;
+  }
+  const int avg_dim = (block_size_wide[bsize] + block_size_high[bsize]) / 2;
+  return avg_dim > dim;
+}
+
+// Returns 1 if the previously evaluated partition looks better than the new
+// candidate under the SMS rd model (i.e. the new partition should be pruned).
+int av1_prune_new_part(const SMSPartitionStats *old_part,
+                       const SMSPartitionStats *new_part, int rdmult,
+                       BLOCK_SIZE bsize, const SPEED_FEATURES *sf) {
+  RD_STATS old_rd_stat, new_rd_stat;
+  gather_part_rd_stats(&old_rd_stat, old_part, rdmult);
+  gather_part_rd_stats(&new_rd_stat, new_part, rdmult);
+  if (sf->part_sf.enable_fast_erp < 2 && is_avg_dim_greater_than(bsize, 32)) {
+    return old_rd_stat.rdcost < new_rd_stat.rdcost;
+  }
+  // Give the new partition a 0.1% tolerance. Cast the scaled value back to
+  // int64_t, not int: rdcost is 64-bit, so an int cast would truncate large
+  // costs (including the INT64_MAX sentinel) and invert the comparison.
+  return old_rd_stat.rdcost < (int64_t)(1.001 * (double)new_rd_stat.rdcost);
+}
+
+// Decides, from cached simple-motion-search statistics, whether the HORZ/VERT
+// split (`rect_type`) of the block described by blk_params can be pruned.
+// Returns true when the rectangular split -- and, as a recursion guard, its
+// further second-level split -- both compare unfavorably to PARTITION_NONE.
+bool av1_prune_part_hv_with_sms(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                MACROBLOCK *x,
+                                const PartitionSearchState *part_search_state,
+                                const RD_STATS *best_rdc,
+                                const PartitionBlkParams *blk_params,
+                                RECT_PART_TYPE rect_type, int part_rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // mi offset of the second subblock for each rectangular partition type.
+  const int blk_offsets[NUM_RECT_PARTS][2] = { // HORZ
+                                               { blk_params->mi_step_h, 0 },
+                                               // VERT
+                                               { 0, blk_params->mi_step_w }
+  };
+
+  // Gather SMS stats for the two rectangular subblocks of this split.
+  SMSPartitionStats part_data;
+  const SimpleMotionData *blk1 =
+      av1_get_sms_data(cpi, &tile_data->tile_info, x, blk_params->mi_row,
+                       blk_params->mi_col, blk_params->subsize);
+  const SimpleMotionData *blk2 = av1_get_sms_data(
+      cpi, &tile_data->tile_info, x,
+      blk_params->mi_row + blk_offsets[rect_type][0],
+      blk_params->mi_col + blk_offsets[rect_type][1], blk_params->subsize);
+  part_data.sms_data[0] = blk1;
+  part_data.sms_data[1] = blk2;
+  part_data.num_sub_parts = 2;
+  part_data.part_rate = part_rate;
+
+  // Only prune when a finite best rdcost exists, the whole block lies inside
+  // the frame, and PARTITION_NONE beats the rectangular split.
+  if (best_rdc->rdcost < INT64_MAX &&
+      (blk_params->mi_row + 2 * blk_params->mi_step_h <=
+       cm->mi_params.mi_rows) &&
+      (blk_params->mi_col + 2 * blk_params->mi_step_w <=
+       cm->mi_params.mi_cols) &&
+      av1_prune_new_part(&part_search_state->none_data, &part_data, x->rdmult,
+                         blk_params->bsize, &cpi->sf)) {
+    const PARTITION_TYPE second_level_part =
+        (rect_type == HORZ) ? PARTITION_VERT : PARTITION_HORZ;
+    const BLOCK_SIZE subsubsize =
+        get_partition_subsize(blk_params->subsize, second_level_part);
+    if (subsubsize == BLOCK_INVALID) {
+      return true;
+    }
+
+    // Do one more check to deal with recursion: compare PARTITION_NONE
+    // against the four quadrants reachable through the second-level split.
+    SMSPartitionStats subpart_data;
+    const SimpleMotionData *upleft =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x, blk_params->mi_row,
+                         blk_params->mi_col, subsubsize);
+    const SimpleMotionData *upright = av1_get_sms_data(
+        cpi, &tile_data->tile_info, x, blk_params->mi_row,
+        blk_params->mi_col + blk_params->mi_step_w, subsubsize);
+    const SimpleMotionData *downleft =
+        av1_get_sms_data(cpi, &tile_data->tile_info, x,
+                         blk_params->mi_row + blk_params->mi_step_h,
+                         blk_params->mi_col, subsubsize);
+    const SimpleMotionData *downright = av1_get_sms_data(
+        cpi, &tile_data->tile_info, x,
+        blk_params->mi_row + blk_params->mi_step_h,
+        blk_params->mi_col + blk_params->mi_step_w, subsubsize);
+    subpart_data.sms_data[0] = upleft;
+    subpart_data.sms_data[1] = upright;
+    subpart_data.sms_data[2] = downleft;
+    subpart_data.sms_data[3] = downright;
+    subpart_data.num_sub_parts = 4;
+    subpart_data.part_rate = 0;
+    if (av1_prune_new_part(&part_search_state->none_data, &subpart_data,
+                           x->rdmult, blk_params->bsize, &cpi->sf)) {
+      return true;
+    }
+  }
+  return false;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h
index a386bf9..1587b67 100644
--- a/av1/encoder/partition_strategy.h
+++ b/av1/encoder/partition_strategy.h
@@ -12,6 +12,7 @@
 #ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
 #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
 
+#include "av1/encoder/block.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encoder.h"
@@ -191,6 +192,67 @@
     int *horza_partition_allowed, int *horzb_partition_allowed,
     int *verta_partition_allowed, int *vertb_partition_allowed);
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+SimpleMotionData *av1_get_sms_data_entry(SimpleMotionDataBufs *sms_bufs,
+                                         int mi_row, int mi_col,
+                                         BLOCK_SIZE bsize, BLOCK_SIZE sb_size);
+SimpleMotionData *av1_get_sms_data(AV1_COMP *const cpi,
+                                   const TileInfo *const tile, MACROBLOCK *x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize);
+
+// Stores ctx in the single-entry mode cache of sms_data if the cache is empty
+// or ctx has a lower (better) rdcost than the currently cached context.
+static AOM_INLINE void av1_add_mode_search_context_to_cache(
+    SimpleMotionData *sms_data, PICK_MODE_CONTEXT *ctx) {
+  if (!sms_data->mode_cache[0] ||
+      sms_data->mode_cache[0]->rd_stats.rdcost > ctx->rd_stats.rdcost) {
+    sms_data->mode_cache[0] = ctx;
+  }
+}
+
+static INLINE void av1_set_best_mode_cache(MACROBLOCK *x,
+                                           PICK_MODE_CONTEXT *mode_cache[1]) {
+  // Point x->inter_mode_cache at the cached mode info when the cache holds a
+  // searched entry (rate != INT_MAX); otherwise disable mode reuse.
+  const int cache_usable =
+      mode_cache[0] != NULL && mode_cache[0]->rd_stats.rate != INT_MAX;
+  x->inter_mode_cache = cache_usable ? &mode_cache[0]->mic : NULL;
+}
+
+// Aggregated simple-motion-search statistics for one candidate partition.
+typedef struct SMSPartitionStats {
+  const SimpleMotionData *sms_data[4];  // SMS results, one per subblock
+  int num_sub_parts;                    // number of valid entries in sms_data
+  int part_rate;  // partition signaling rate; INT_MAX means unset
+} SMSPartitionStats;
+
+// Resets stats to the empty state: no subblocks and an unset (INT_MAX)
+// partition rate.
+static INLINE void av1_init_sms_partition_stats(SMSPartitionStats *stats) {
+  memset(stats->sms_data, 0, sizeof(stats->sms_data));
+  stats->num_sub_parts = 0;
+  stats->part_rate = INT_MAX;
+}
+
+// Returns 1 if we think the old part is better and we should prune new
+// partition, 0 otherwise.
+int av1_prune_new_part(const SMSPartitionStats *old_part,
+                       const SMSPartitionStats *new_part, int rdmult,
+                       BLOCK_SIZE bsize, const SPEED_FEATURES *sf);
+
+void av1_cache_best_partition(SimpleMotionDataBufs *sms_bufs, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, BLOCK_SIZE sb_size,
+                              PARTITION_TYPE partition);
+
+void av1_copy_sms_part(const SimpleMotionData **part_dst, int *part_size_dst,
+                       int *part_rate_dst,
+                       const SimpleMotionData *const *part_src,
+                       int part_size_src, int part_rate_src);
+
+struct PartitionBlkParams;
+struct PartitionSearchState;
+bool av1_prune_part_hv_with_sms(
+    AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+    const struct PartitionSearchState *part_search_state,
+    const RD_STATS *best_rdc, const struct PartitionBlkParams *blk_params,
+    RECT_PART_TYPE rect_type, int part_rate);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 // A simplified version of set_offsets meant to be used for
 // simple_motion_search.
 static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
@@ -208,15 +270,15 @@
                         mi_row, mi_col);
 
   // Set up destination pointers.
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, NULL);
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
   av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
                     mi_width, cpi->oxcf.border_in_pixels);
 
-  set_plane_n4(xd, mi_width, mi_height, num_planes);
+  set_plane_n4(xd, mi_width, mi_height, num_planes, NULL);
 
   xd->mi_row = mi_row;
   xd->mi_col = mi_col;
@@ -231,7 +293,7 @@
       GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
 
   // Set up source buffers.
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, NULL);
 }
 
 static INLINE void init_simple_motion_search_mvs(
@@ -250,6 +312,15 @@
   }
 }
 
+PARTITION_TYPE av1_get_prev_partition(MACROBLOCK *x, int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize, BLOCK_SIZE sb_size);
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+// Zero-initializes the simple-motion-search cache, marking every entry
+// invalid so it will be recomputed on first use.
+static INLINE void av1_init_sms_data_bufs(SimpleMotionDataBufs *data_bufs) {
+  memset(data_bufs, 0, sizeof(*data_bufs));
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
                              int mi_row, int mi_col, BLOCK_SIZE sb_size) {
   const int sb_mi_wide = mi_size_wide[sb_size];
diff --git a/av1/encoder/pickccso.c b/av1/encoder/pickccso.c
index 3b90267..b744b4d 100644
--- a/av1/encoder/pickccso.c
+++ b/av1/encoder/pickccso.c
@@ -448,8 +448,8 @@
   int64_t rdmult_temp = (int64_t)rdmult * (int64_t)rdmult_weight;
   if (rdmult_temp < INT_MAX) rdmult = (int)rdmult_temp;
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, &cm->cur_frame->buf,
-                       0, 0, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, 0, 0, 0, num_planes,
+                       NULL);
   ccso_stride = xd->plane[0].dst.width;
   ccso_stride_ext = xd->plane[0].dst.width + (CCSO_PADDING_SIZE << 1);
   derive_ccso_filter(cm, AOM_PLANE_U, xd, org_uv[AOM_PLANE_U - 1], ext_rec_y,
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index e88cf87..2fb36b0 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -431,8 +431,7 @@
   const int total_strengths = nb_cdef_strengths[pick_method];
   DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
+  av1_setup_dst_planes(xd->plane, frame, 0, 0, 0, num_planes, NULL);
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
   mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
   mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 0e4e0a0..9da1e9e 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -114,6 +114,12 @@
                              fc->partition_cdf[i], NULL);
 #endif
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  for (i = 0; i < PARTITION_CONTEXTS_REC; ++i)
+    av1_cost_tokens_from_cdf(mode_costs->partition_rec_cost[i],
+                             fc->partition_rec_cdf[i], NULL);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   if (cm->current_frame.skip_mode_info.skip_mode_flag) {
     for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
       av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
@@ -1269,19 +1275,11 @@
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   for (int i = 0; i < num_planes; ++i) {
-#if CONFIG_SDP
-    setup_pred_plane(dst + i, xd->mi[0]->sb_type[i > 0 ? 1 : 0], dst[i].buf,
-                     i ? src->uv_crop_width : src->y_crop_width,
-                     i ? src->uv_crop_height : src->y_crop_height,
-                     dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
-                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
-#else
-    setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
-                     i ? src->uv_crop_width : src->y_crop_width,
-                     i ? src->uv_crop_height : src->y_crop_height,
-                     dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
-                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
-#endif
+    setup_pred_plane(
+        dst + i, dst[i].buf, i ? src->uv_crop_width : src->y_crop_width,
+        i ? src->uv_crop_height : src->y_crop_height, dst[i].stride, mi_row,
+        mi_col, i ? scale_uv : scale, xd->plane[i].subsampling_x,
+        xd->plane[i].subsampling_y, &xd->mi[0]->chroma_ref_info);
   }
 }
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 5a8b113..4c39cc3 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -755,10 +755,15 @@
     if (plane && !xd->is_chroma_ref) break;
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_SDP
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+    const BLOCK_SIZE bs = get_mb_plane_block_size(
+        xd, mbmi, plane, pd->subsampling_x, pd->subsampling_y);
+#elif CONFIG_SDP
     const BLOCK_SIZE bs = get_plane_block_size(
         mbmi->sb_type[plane > 0], pd->subsampling_x, pd->subsampling_y);
 #else
+    // TODO(chiyotsai, yuec): This appears to be wrong when EXT_RECUR_PARTITIONS
+    // is on?
     const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
                                                pd->subsampling_y);
 #endif
@@ -3983,7 +3988,7 @@
       }
 
       if (num_planes > 1) {
-        av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+        av1_txfm_uvrd(cpi, x, &rd_stats_uv, INT64_MAX);
       } else {
         av1_init_rd_stats(&rd_stats_uv);
       }
@@ -4242,6 +4247,37 @@
   return 0;
 }
 
+// Checks whether any compound reference type that is NOT masked out by
+// skip_ref_frame_mask uses ref_frame as either of its component frames.
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+    int ref_frame, int skip_ref_frame_mask) {
+  for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+    if (skip_ref_frame_mask & (1 << r)) continue;
+    const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+    if (rf[0] == ref_frame || rf[1] == ref_frame) return 1;
+  }
+  return 0;
+}
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+// Returns 1 if the cached mode info references ref_frame. A single reference
+// frame matches either slot of the cached pair; a compound reference type
+// must equal the cached pair's combined type.
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+                                                 const MB_MODE_INFO *mi_cache) {
+  if (!mi_cache) {
+    return 0;
+  }
+
+  if (ref_frame < REF_FRAMES) {
+    return (ref_frame == mi_cache->ref_frame[0] ||
+            ref_frame == mi_cache->ref_frame[1]);
+  }
+
+  // if we are here, then the current mode is compound.
+  MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame);
+  return ref_frame == cached_ref_type;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 // Please add/modify parameter setting in this function, making it consistent
 // and easy to read and maintain.
 static AOM_INLINE void set_params_rd_pick_inter_mode(
@@ -4271,18 +4307,14 @@
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       if (mbmi->partition != PARTITION_NONE &&
           mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          int skip = 1;
-          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-            if (!(skip_ref_frame_mask & (1 << r))) {
-              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-              if (rf[0] == ref_frame || rf[1] == ref_frame) {
-                skip = 0;
-                break;
-              }
-            }
-          }
-          if (skip) continue;
+        if (skip_ref_frame_mask & (1 << ref_frame) &&
+            !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask)
+#if CONFIG_EXT_RECUR_PARTITIONS
+            && !(should_reuse_mode(x, REUSE_INTER_MODE_IN_INTERFRAME_FLAG) &&
+                 is_ref_frame_used_in_cache(ref_frame, x->inter_mode_cache))
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+        ) {
+          continue;
         }
       }
       assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
@@ -4308,7 +4340,12 @@
 
       if (mbmi->partition != PARTITION_NONE &&
           mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
+        if (skip_ref_frame_mask & (1 << ref_frame)
+#if CONFIG_EXT_RECUR_PARTITIONS
+            && !(should_reuse_mode(x, REUSE_INTER_MODE_IN_INTERFRAME_FLAG) &&
+                 is_ref_frame_used_in_cache(ref_frame, x->inter_mode_cache))
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+        ) {
           continue;
         }
       }
@@ -4332,7 +4369,7 @@
   if (cpi->oxcf.motion_mode_cfg.enable_obmc && !cpi->sf.inter_sf.disable_obmc &&
       !prune_obmc) {
     if (check_num_overlappable_neighbors(mbmi) &&
-        is_motion_variation_allowed_bsize(bsize)) {
+        is_motion_variation_allowed_bsize(bsize, mi_row, mi_col)) {
       int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
       int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
                                        MAX_SB_SIZE >> 1 };
@@ -4346,8 +4383,8 @@
                                          dst_width2, dst_height2,
                                          args->left_pred_stride);
       const int num_planes = av1_num_planes(cm);
-      av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
-                           mi_col, 0, num_planes);
+      av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                           num_planes, &mbmi->chroma_ref_info);
       calc_target_weighted_pred(
           cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0],
           args->left_pred_buf[0], args->left_pred_stride[0]);
@@ -4523,6 +4560,84 @@
   return picked_ref_frames_mask;
 }
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+// Returns 1 when mode lies in the intra prediction mode range.
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+  return mode < INTRA_MODE_END;
+}
+
+// Reuse the prediction mode in cache.
+// Returns 0 if no pruning is done, 1 if we are skipping the current mode
+// completely, 2 if we skip compound only, but still try single motion modes
+static INLINE int skip_inter_mode_with_cached_mode(
+    const MACROBLOCK *x, PREDICTION_MODE mode,
+    const MV_REFERENCE_FRAME *ref_frame) {
+  const MB_MODE_INFO *cached_mi = x->inter_mode_cache;
+
+  // If there is no cache, then no pruning is possible.
+  if (!cached_mi) {
+    return 0;
+  }
+
+  const PREDICTION_MODE cached_mode = cached_mi->mode;
+  const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+  const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+  // If the cached mode is intra, then we just need to match the mode.
+  if (should_reuse_mode(x, REUSE_INTRA_MODE_IN_INTERFRAME_FLAG) &&
+      is_mode_intra(cached_mode) && mode != cached_mode) {
+    return 1;
+  }
+
+  // Returns 0 here if we are not reusing inter_modes
+  // NOTE(review): the !cached_mi term below is redundant -- cached_mi was
+  // already null-checked above.
+  if (!should_reuse_mode(x, REUSE_INTER_MODE_IN_INTERFRAME_FLAG) ||
+      !cached_mi) {
+    return 0;
+  }
+
+  // If the cached mode is single inter mode, then we match the mode and
+  // reference frame.
+  if (cached_mode_is_single) {
+    if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+      return 1;
+    }
+  } else {
+    // If the cached mode is compound, then we need to consider several cases.
+    const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+    if (mode_is_single) {
+      // If the mode is single, we know the modes can't match. But we might
+      // still want to search it if compound mode depends on the current mode.
+      int skip_motion_mode_only = 0;
+#if CONFIG_NEW_INTER_MODES
+      if (cached_mode == NEW_NEARMV) {
+#else
+      if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+#endif
+        skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+#if CONFIG_NEW_INTER_MODES
+      } else if (cached_mode == NEAR_NEWMV) {
+#else
+      } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+#endif
+        skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+      } else if (cached_mode == NEW_NEWMV) {
+        skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+                                 ref_frame[0] == cached_frame[1]);
+      }
+
+      // 1 + skip_motion_mode_only yields 2 (skip compound only) when the
+      // current single mode feeds the cached compound mode, else 1.
+      return 1 + skip_motion_mode_only;
+    } else {
+      // If both modes are compound, then everything must match.
+      if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+          ref_frame[1] != cached_frame[1]) {
+        return 1;
+      }
+    }
+  }
+
+  return 0;
+}
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 // Case 1: return 0, means don't skip this mode
 // Case 2: return 1, means skip this mode completely
 // Case 3: return 2, means skip compound only, but still try single motion modes
@@ -4546,6 +4661,13 @@
   if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
     return 1;
   }
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int cached_skip_ret =
+      skip_inter_mode_with_cached_mode(x, mode, ref_frame);
+  if (cached_skip_ret > 0) {
+    return cached_skip_ret;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
   const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
   // If no valid mode has been found so far in PARTITION_NONE when finding a
@@ -4576,6 +4698,18 @@
         }
       }
     }
+#if CONFIG_EXT_RECUR_PARTITIONS
+    // If we are reusing the prediction from cache, and the current frame is
+    // required by the cache, then we cannot prune it.
+    if (should_reuse_mode(x, REUSE_INTER_MODE_IN_INTERFRAME_FLAG) &&
+        is_ref_frame_used_in_cache(ref_type, x->inter_mode_cache)) {
+      skip_ref = 0;
+      // If the cache only needs the current reference type for compound
+      // prediction, then we can skip motion mode search.
+      skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+                          x->inter_mode_cache->ref_frame[1] > INTRA_FRAME);
+    }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     if (skip_ref) return 1;
   }
 
@@ -5899,17 +6033,26 @@
 
   // Only try palette mode when the best mode so far is an intra mode.
 #if CONFIG_SDP
-  const int try_palette =
-      cpi->oxcf.tool_cfg.enable_palette &&
-      av1_allow_palette(features->allow_screen_content_tools,
-                        mbmi->sb_type[PLANE_TYPE_Y]) &&
-      !is_inter_mode(search_state.best_mbmode.mode);
+  int try_palette = cpi->oxcf.tool_cfg.enable_palette &&
+                    av1_allow_palette(features->allow_screen_content_tools,
+                                      mbmi->sb_type[PLANE_TYPE_Y]) &&
+                    !is_inter_mode(search_state.best_mbmode.mode) &&
+                    rd_cost->rate < INT_MAX;
 #else
-  const int try_palette =
+  int try_palette =
       cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) &&
-      !is_inter_mode(search_state.best_mbmode.mode);
+      !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate < INT_MAX;
 #endif
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const MB_MODE_INFO *cached_mode = x->inter_mode_cache;
+  if (should_reuse_mode(x, REUSE_INTRA_MODE_IN_INTERFRAME_FLAG) &&
+      cached_mode &&
+      !(cached_mode->mode == DC_PRED &&
+        cached_mode->palette_mode_info.palette_size[0] > 0)) {
+    try_palette = 0;
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   RD_STATS this_rd_cost;
   int this_skippable = 0;
   if (try_palette) {
@@ -6110,7 +6253,8 @@
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   av1_count_overlappable_neighbors(cm, xd);
-  if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+  if (is_motion_variation_allowed_bsize(bsize, mi_row, mi_col) &&
+      !has_second_ref(mbmi)) {
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
     // Select the samples according to motion vector difference
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index 288ab4c..4514c5e 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -43,32 +43,54 @@
   (void)mc_buf;
 
   const struct scale_factors *sf = inter_pred_params->scale_factors;
-
   struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
-  int ssx = inter_pred_params->subsampling_x;
-  int ssy = inter_pred_params->subsampling_y;
-  int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
-  orig_pos_y += src_mv->row * (1 << (1 - ssy));
-  int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
-  orig_pos_x += src_mv->col * (1 << (1 - ssx));
-  int pos_y = sf->scale_value_y(orig_pos_y, sf);
-  int pos_x = sf->scale_value_x(orig_pos_x, sf);
-  pos_x += SCALE_EXTRA_OFF;
-  pos_y += SCALE_EXTRA_OFF;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  const int is_scaled = av1_is_scaled(sf);
+  if (is_scaled || !xd) {
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+    int ssx = inter_pred_params->subsampling_x;
+    int ssy = inter_pred_params->subsampling_y;
+    int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+    orig_pos_y += src_mv->row * (1 << (1 - ssy));
+    int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+    orig_pos_x += src_mv->col * (1 << (1 - ssx));
+    int pos_y = sf->scale_value_y(orig_pos_y, sf);
+    int pos_x = sf->scale_value_x(orig_pos_x, sf);
+    pos_x += SCALE_EXTRA_OFF;
+    pos_y += SCALE_EXTRA_OFF;
 
-  const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-  const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-  const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-  pos_y = clamp(pos_y, top, bottom);
-  pos_x = clamp(pos_x, left, right);
+    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                       << SCALE_SUBPEL_BITS;
+    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+    pos_y = clamp(pos_y, top, bottom);
+    pos_x = clamp(pos_x, left, right);
 
-  subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
-  subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
-  subpel_params->xs = sf->x_step_q4;
-  subpel_params->ys = sf->y_step_q4;
-  *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-         (pos_x >> SCALE_SUBPEL_BITS);
+    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+    subpel_params->xs = sf->x_step_q4;
+    subpel_params->ys = sf->y_step_q4;
+    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+           (pos_x >> SCALE_SUBPEL_BITS);
+#if CONFIG_EXT_RECUR_PARTITIONS
+  } else {
+    int pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+    int pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+    const int bw = inter_pred_params->block_width;
+    const int bh = inter_pred_params->block_height;
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(
+        xd, src_mv, bw, bh, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y);
+    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    pos_x += mv_q4.col;
+    pos_y += mv_q4.row;
+    *pre = pre_buf->buf0 + (pos_y >> SUBPEL_BITS) * pre_buf->stride +
+           (pos_x >> SUBPEL_BITS);
+  }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   *src_stride = pre_buf->stride;
 }
 
@@ -147,21 +169,15 @@
                                    int mi_col_offset, MB_MODE_INFO *ref_mbmi,
                                    struct build_prediction_ctxt *ctxt,
                                    const int num_planes) {
-#if CONFIG_SDP
-  const BLOCK_SIZE ref_bsize =
-      AOMMAX(BLOCK_8X8, ref_mbmi->sb_type[PLANE_TYPE_Y]);
-#else
-  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type);
-#endif
   const int ref_mi_row = xd->mi_row + mi_row_offset;
   const int ref_mi_col = xd->mi_col + mi_col_offset;
 
   for (int plane = 0; plane < num_planes; ++plane) {
     struct macroblockd_plane *const pd = &xd->plane[plane];
-    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
-                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
-                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+    setup_pred_plane(&pd->dst, ctxt->tmp_buf[plane], ctxt->tmp_width[plane],
+                     ctxt->tmp_height[plane], ctxt->tmp_stride[plane],
+                     mi_row_offset, mi_col_offset, NULL, pd->subsampling_x,
+                     pd->subsampling_y, NULL);
   }
 
   const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
@@ -176,7 +192,7 @@
                        "Reference frame has invalid dimensions");
 
   av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
-                       num_planes);
+                       num_planes, NULL);
 }
 
 static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
@@ -291,13 +307,8 @@
                                       dst_stride1);
   av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
                                      dst_stride2);
-#if CONFIG_SDP
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type[PLANE_TYPE_Y],
-                       &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes);
-#else
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
-#endif
+  av1_setup_dst_planes(xd->plane, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes, &xd->mi[0]->chroma_ref_info);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
@@ -305,7 +316,6 @@
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
     uint8_t *ext_dst[3], int ext_dst_stride[3]) {
-  assert(bsize < BLOCK_SIZES_ALL);
   const MB_MODE_INFO *mi = xd->mi[0];
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -318,8 +328,18 @@
 
   for (int plane = plane_from; plane <= plane_to; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+        xd, mi, plane, pd->subsampling_x, pd->subsampling_y);
+#if CONFIG_SDP
+    assert(plane_bsize ==
+           get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y));
+#endif  // CONFIG_SDP
+    (void)bsize;
+#else
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
 
@@ -443,10 +463,21 @@
                                               uint8_t *ext_dst1[3],
                                               int ext_dst_stride1[3]) {
   int plane;
-  assert(bsize < BLOCK_SIZES_ALL);
   for (plane = plane_from; plane <= plane_to; ++plane) {
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+        xd, xd->mi[0], plane, xd->plane[plane].subsampling_x,
+        xd->plane[plane].subsampling_y);
+#if CONFIG_SDP
+    assert(plane_bsize == get_plane_block_size(bsize,
+                                               xd->plane[plane].subsampling_x,
+                                               xd->plane[plane].subsampling_y));
+#endif  // CONFIG_SDP
+    (void)bsize;
+#else
     const BLOCK_SIZE plane_bsize = get_plane_block_size(
         bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
     build_wedge_inter_predictor_from_buf(
diff --git a/av1/encoder/reconinter_enc.h b/av1/encoder/reconinter_enc.h
index fdc1f31..9347523 100644
--- a/av1/encoder/reconinter_enc.h
+++ b/av1/encoder/reconinter_enc.h
@@ -54,6 +54,7 @@
 
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
+// TODO(any): Refactor bsize out of the function signature
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
     uint8_t *ext_dst[3], int ext_dst_stride[3]);
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index a240b59..66b206c 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -53,8 +53,9 @@
   if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   xd->mi = mi;
+  assert(xd->mi && xd->mi[0]);
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
-                 mi_params->mi_cols);
+                 mi_params->mi_cols, &xd->mi[0]->chroma_ref_info);
 
   // Count the number of hits on each segment with no prediction
   const int segment_id = xd->mi[0]->segment_id;
@@ -124,6 +125,19 @@
       CSEGS(hbs, bs, 0, 0);
       CSEGS(hbs, bs, 0, hbs);
       break;
+#if CONFIG_EXT_RECUR_PARTITIONS
+    case PARTITION_HORZ_3:
+      CSEGS(bs, qbs, 0, 0);
+      CSEGS(bs, hbs, qbs, 0);
+      if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
+      break;
+
+    case PARTITION_VERT_3:
+      CSEGS(qbs, bs, 0, 0);
+      CSEGS(hbs, bs, 0, qbs);
+      if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
+      break;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_HORZ_A:
       CSEGS(hbs, hbs, 0, 0);
       CSEGS(hbs, hbs, 0, hbs);
@@ -150,14 +164,13 @@
       CSEGS(bs, qbs, 2 * qbs, 0);
       if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
       break;
-
     case PARTITION_VERT_4:
       CSEGS(qbs, bs, 0, 0);
       CSEGS(qbs, bs, 0, qbs);
       CSEGS(qbs, bs, 0, 2 * qbs);
       if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
       break;
-
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     case PARTITION_SPLIT: {
       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       int n;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index cc42808..c47861d 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -153,7 +153,11 @@
     else
       sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
   } else {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
   }
 
@@ -181,9 +185,17 @@
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
     } else if (is_480p_or_larger) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     } else {
+#if CONFIG_EXT_RECUR_PARTITIONS
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
       sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     }
 
     if (!is_720p_or_larger) {
@@ -197,6 +209,9 @@
   }
 
   if (speed >= 2) {
+#if CONFIG_EXT_RECUR_PARTITIONS
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
     } else if (is_480p_or_larger) {
@@ -204,6 +219,7 @@
     } else {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
     }
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 
     if (is_720p_or_larger) {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
@@ -309,12 +325,25 @@
   sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
 
   sf->part_sf.less_rectangular_check_level = 1;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  sf->part_sf.enable_fast_erp = 0;
+  sf->part_sf.ml_prune_4_partition = 0;
+  sf->part_sf.ml_prune_ab_partition = 0;
+
+  sf->part_sf.prune_part_3_with_part_none = 1;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
   sf->part_sf.ml_prune_4_partition = 1;
   sf->part_sf.ml_prune_ab_partition = 1;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   sf->part_sf.ml_prune_rect_partition = 1;
   sf->part_sf.prune_ext_partition_types_search_level = 1;
   sf->part_sf.simple_motion_search_prune_rect = 1;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  sf->inter_sf.reuse_erp_mode_flag =
+      (REUSE_PARTITION_MODE_FLAG | REUSE_INTER_MODE_IN_INTERFRAME_FLAG |
+       REUSE_INTRA_MODE_IN_INTERFRAME_FLAG);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   sf->inter_sf.disable_wedge_search_var_thresh = 0;
   // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 1;
@@ -362,7 +391,11 @@
     sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
     sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+    sf->part_sf.intra_cnn_split = 0;
+#else   // CONFIG_EXT_RECUR_PARTITIONS
     sf->part_sf.intra_cnn_split = 1;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
     sf->part_sf.simple_motion_search_early_term_none = 1;
     // TODO(Venkat): Clean-up frame type dependency for
     // simple_motion_search_split in partition search function and set the
@@ -705,6 +738,10 @@
   part_sf->prune_4_partition_using_split_info = 0;
   part_sf->prune_ab_partition_using_split_info = 0;
   part_sf->early_term_after_none_split = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  part_sf->enable_fast_erp = 0;
+  part_sf->prune_part_3_with_part_none = 1;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
@@ -771,6 +808,9 @@
   inter_sf->txfm_rd_gate_level = 0;
   inter_sf->prune_inter_modes_if_skippable = 0;
   inter_sf->disable_masked_comp = 0;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  inter_sf->reuse_erp_mode_flag = 0;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 }
 
 static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
@@ -897,7 +937,9 @@
   part_sf->ml_prune_ab_partition = 0;
   part_sf->ml_prune_rect_partition = 0;
   part_sf->ml_early_term_after_part_split_level = 0;
+#if !CONFIG_EXT_RECUR_PARTITIONS
   part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
   part_sf->intra_cnn_split = 0;
   part_sf->simple_motion_search_split = 0;
   part_sf->simple_motion_search_prune_rect = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index d52d2fc..eb7db43 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -310,6 +310,25 @@
 } UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
 
 /*!\endcond */
+#if CONFIG_EXT_RECUR_PARTITIONS
+/*! \brief Used with \ref MACROBLOCK::reuse_inter_mode_cache_type to determine
+ * whether partition mode is reused. */
+#define REUSE_PARTITION_MODE_FLAG (1 << 0)
+
+/*! \brief Used with \ref MACROBLOCK::reuse_inter_mode_cache_type to determine
+ * whether the intra prediction_mode is reused. */
+#define REUSE_INTRA_MODE_IN_INTERFRAME_FLAG (1 << 1)
+
+/*! \brief Used with \ref MACROBLOCK::reuse_inter_mode_cache_type to determine
+ * whether the inter prediction_mode and ref frame are reused. */
+#define REUSE_INTER_MODE_IN_INTERFRAME_FLAG (1 << 2)
+
+/*! \brief Used with \ref MACROBLOCK::reuse_inter_mode_cache_type to signal
+ * reuse of inter and intra prediction_modes, as well as ref frame. */
+#define REUSE_INTERFRAME_FLAG \
+  (REUSE_INTRA_MODE_IN_INTERFRAME_FLAG | REUSE_INTER_MODE_IN_INTERFRAME_FLAG)
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
 /*!
  * \brief Sequence/frame level speed vs quality features
  */
@@ -508,6 +527,13 @@
   // Terminate partition search for child partition,
   // when NONE and SPLIT partition rd_costs are INT64_MAX.
   int early_term_after_none_split;
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int enable_fast_erp;
+
+  // Prunes PARTITION_3 if PARTITION_NONE is used instead of PARTITION_HORZ|VERT
+  int prune_part_3_with_part_none;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 } PARTITION_SPEED_FEATURES;
 
 typedef struct MV_SPEED_FEATURES {
@@ -758,6 +784,13 @@
 
   // Enable/disable masked compound.
   int disable_masked_comp;
+
+#if CONFIG_EXT_RECUR_PARTITIONS
+  // Under ERP, determines whether to reuse partition mode and prediction mode
+  // if a block with the same (mi_row, mi_col, bsize) is visited more than once
+  // by the encoder.
+  int reuse_erp_mode_flag;
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 1613bf8..de7a189 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -141,7 +141,14 @@
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-#if CONFIG_SDP
+#if CONFIG_SDP && CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE bsize_base = get_bsize_base(xd, mbmi, plane);
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(bsize_base, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+#elif CONFIG_SDP
   const TX_SIZE plane_tx_size =
       plane ? av1_get_max_uv_txsize(mbmi->sb_type[xd->tree_type == CHROMA_PART],
                                     pd->subsampling_x, pd->subsampling_y)
@@ -149,24 +156,27 @@
                                                          blk_col)];
 #else
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
-                                    pd->subsampling_y)
+      plane ? av1_get_max_uv_txsize(mbmi->chroma_ref_info.bsize_base,
+                                    pd->subsampling_x, pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 #endif
 
   if (tx_size == plane_tx_size || plane) {
-#if CONFIG_SDP
-    plane_bsize =
-        get_plane_block_size(mbmi->sb_type[xd->tree_type == CHROMA_PART],
-                             pd->subsampling_x, pd->subsampling_y);
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    plane_bsize = get_mb_plane_block_size(xd, mbmi, plane, pd->subsampling_x,
+                                          pd->subsampling_y);
+#if !CONFIG_EXT_RECUR_PARTITIONS
+    assert(plane_bsize ==
+           get_plane_block_size(mbmi->sb_type[xd->tree_type == CHROMA_PART],
+                                pd->subsampling_x, pd->subsampling_y));
+#endif  // !CONFIG_EXT_RECUR_PARTITIONS
 #else
     plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
                                        pd->subsampling_y);
-#endif
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
                                       plane_bsize, tx_size, arg);
-
   } else {
 #if CONFIG_NEW_TX_PARTITION
     TX_SIZE sub_txs[MAX_TX_PARTITIONS] = { 0 };
@@ -240,8 +250,12 @@
 #if CONFIG_SDP
   if (mbmi->skip_txfm[xd->tree_type == CHROMA_PART]) {
 #else
+  assert(mbmi->sb_type < BLOCK_SIZES_ALL);
   if (mbmi->skip_txfm) {
 #endif
+#if CONFIG_SDP
+    assert(bsize == mbmi->sb_type[av1_get_sdp_idx(xd->tree_type)]);
+#endif  // CONFIG_SDP
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
@@ -256,7 +270,20 @@
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const int ss_x = pd->subsampling_x;
     const int ss_y = pd->subsampling_y;
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE plane_bsize =
+        get_mb_plane_block_size(xd, mbmi, plane, ss_x, ss_y);
+#if CONFIG_SDP
+    const BLOCK_SIZE bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base : bsize;
+    assert(plane_bsize == get_plane_block_size(bsize_base, ss_x, ss_y));
+    (void)bsize_base;
+#endif  // CONFIG_SDP
+#else
+    const BLOCK_SIZE bsize_base =
+        plane ? mbmi->chroma_ref_info.bsize_base : bsize;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize_base, ss_x, ss_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     assert(plane_bsize < BLOCK_SIZES_ALL);
     const int mi_width = mi_size_wide[plane_bsize];
     const int mi_height = mi_size_high[plane_bsize];
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 6a2e5cc..3928c62 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -275,9 +275,9 @@
   set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
                         mi_row, mi_col);
   set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
-                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
-  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
-               av1_num_planes(cm));
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols, NULL);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], av1_num_planes(cm),
+               NULL);
 #if CONFIG_SDP
   xd->mi[0]->sb_type[xd->tree_type == CHROMA_PART] = bsize;
 #else
@@ -783,6 +783,12 @@
   tpl_reset_src_ref_frames(tpl_data);
   av1_tile_init(&xd->tile, cm, 0, 0);
 
+  // TODO(any): The tiles are not being set correctly by av1_tile_init above as
+  // it always assumes the first tile is used. We set the tile size here as a
+  // hack.
+  xd->tile.mi_row_end = cm->mi_params.mi_rows;
+  xd->tile.mi_col_end = cm->mi_params.mi_cols;
+
   // Setup scaling factor
   av1_setup_scale_factors_for_frame(
       &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 9b7ae7f..f109006 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/blockd.h"
 #include "av1/common/cfl.h"
 #include "av1/common/reconintra.h"
 #include "av1/encoder/block.h"
@@ -907,8 +908,13 @@
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
+    const BLOCK_SIZE bs = get_mb_plane_block_size(
+        xd, mbmi, plane, pd->subsampling_x, pd->subsampling_y);
+#else
     const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
                                                pd->subsampling_y);
+#endif  // CONFIG_EXT_RECUR_PARTITIONS || CONFIG_SDP
     unsigned int sse;
 
     if (x->skip_chroma_rd && plane) continue;
@@ -1337,7 +1343,12 @@
          tx_size_wide[tx_size] == tx_size_high[tx_size]);
 #else
   assert(cpi->sf.tx_sf.use_intra_txb_hash &&
+#if CONFIG_SDP
+         frame_is_intra_only(&cpi->common) &&
+         !is_inter_block(xd->mi[0], xd->tree_type) &&
+#else
          frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
+#endif
          plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
 #endif
   const uint32_t intra_hash =
@@ -2421,6 +2432,11 @@
       // Therefore transform domain distortion is not valid for these
       // transform sizes.
       (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+#if CONFIG_IST
+      // Use pixel domain distortion for IST
+      // TODO(any): Make IST compatible with tx domain distortion
+      !cm->seq_params.enable_ist &&
+#endif
       // Use pixel domain distortion for DC only blocks
       !dc_only_blk;
   // Flag to indicate if an extra calculation of distortion in the pixel domain
@@ -3482,7 +3498,7 @@
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
 #endif
     assert(!is_inter || plane_bsize < BLOCK_8X8);
-    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size);
   }
 
 #if CONFIG_RD_DEBUG
@@ -4086,7 +4102,7 @@
 }
 
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
-                  BLOCK_SIZE bsize, int64_t ref_best_rd) {
+                  int64_t ref_best_rd) {
   av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) return 0;
   if (!x->e_mbd.is_chroma_ref) return 1;
@@ -4100,8 +4116,13 @@
   const int is_inter = is_inter_block(mbmi);
 #endif
   int64_t this_rd = 0, skip_txfm_rd = 0;
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+#if CONFIG_SDP || CONFIG_EXT_RECUR_PARTITIONS
+  const BLOCK_SIZE plane_bsize = get_mb_plane_block_size(
+      xd, mbmi, AOM_PLANE_U, pd->subsampling_x, pd->subsampling_y);
+#else
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(
+      mbmi->chroma_ref_info.bsize_base, pd->subsampling_x, pd->subsampling_y);
+#endif  // CONFIG_SDP || CONFIG_EXT_RECUR_PARTITIONS
 
   if (is_inter) {
     for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
@@ -4273,7 +4294,7 @@
                             AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));
     }
     const int is_cost_valid_uv =
-        av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+        av1_txfm_uvrd(cpi, x, rd_stats_uv, ref_best_chroma_rd);
     if (!is_cost_valid_uv) return 0;
     av1_merge_rd_stats(rd_stats, rd_stats_uv);
   }
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index 5a5d259..f5d145f 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -213,13 +213,12 @@
  * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
  * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
- * \param[in]    bsize          Current macroblock size
  * \param[in]    ref_best_rd    Best RD cost seen for this block so far
  * \return       An integer value is returned. 0: early termination triggered,
                  no valid rd cost available; 1: rd cost values are valid.
  */
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
-                  BLOCK_SIZE bsize, int64_t ref_best_rd);
+                  int64_t ref_best_rd);
 
 /*!\brief Transform type search with fixed transform size.
  *
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index ddab3cc..3399929 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -139,7 +139,8 @@
                    "AV2 experiment flag to remove dist_wtd_comp tool.")
 set_aom_config_var(CONFIG_REMOVE_DUAL_FILTER 1
                    "AV2 experiment flag to remove dual filter.")
-
+set_aom_config_var(CONFIG_EXT_RECUR_PARTITIONS 0 NUMBER
+                   "AV2 Fully recursive partitions experiment flag")
 set_aom_config_var(CONFIG_SDP 1 NUMBER "AV2 Semi-Decoupled Partitioning.")
 set_aom_config_var(CONFIG_EXTQUANT 1
                    "AV2 extended quantization experiment flag")
diff --git a/test/intrabc_test.cc b/test/intrabc_test.cc
index b57eb6f..2081f8b 100644
--- a/test/intrabc_test.cc
+++ b/test/intrabc_test.cc
@@ -159,9 +159,7 @@
   for (const DvTestCase &dv_case : kDvCases) {
     const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
     const int mi_col = xd.tile.mi_col_start + dv_case.mi_col_offset;
-    xd.is_chroma_ref = is_chroma_reference(mi_row, mi_col, dv_case.bsize,
-                                           xd.plane[1].subsampling_x,
-                                           xd.plane[1].subsampling_y);
+    xd.is_chroma_ref = 1;
     EXPECT_EQ(static_cast<int>(dv_case.valid),
               av1_is_dv_valid(dv_case.dv, &cm, &xd, mi_row, mi_col,
                               dv_case.bsize, MAX_MIB_SIZE_LOG2));
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 963098e..c734789 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -2077,8 +2077,8 @@
   make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1),
   make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1),
   make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1),
   make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1),
   make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1),
   make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1),
   make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 2fb783d..56a3686 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -624,15 +624,15 @@
                 aom_dc_top_predictor_16x32_sse2,
                 aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
                 aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
+                aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
+                aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
+                aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
 INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
                 aom_dc_left_predictor_16x64_sse2,
                 aom_dc_top_predictor_16x64_sse2,
                 aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
                 aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
-                aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
-                aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
-                aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e566f8f..3cc76cd 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -2583,8 +2583,8 @@
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0),
         SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0),
         SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2,
+        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2, 0),
+        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2,
                                 0)));
 #endif  // HAVE_AVX2
 
diff --git a/tools/aom_entropy_optimizer.c b/tools/aom_entropy_optimizer.c
index ff5ab22..ab279c2 100644
--- a/tools/aom_entropy_optimizer.c
+++ b/tools/aom_entropy_optimizer.c
@@ -326,14 +326,34 @@
   /* block partition */
   cts_each_dim[0] = PARTITION_CONTEXTS;
   cts_each_dim[1] = EXT_PARTITION_TYPES;
+#if CONFIG_EXT_RECUR_PARTITIONS
+  int part_types_each_ctx[PARTITION_CONTEXTS] = {
+    3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3
+  };
+#else
   int part_types_each_ctx[PARTITION_CONTEXTS] = { 4,  4,  4,  4,  10, 10, 10,
                                                   10, 10, 10, 10, 10, 10, 10,
                                                   10, 10, 8,  8,  8,  8 };
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
   optimize_cdf_table_var_modes_2d(
       &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx,
       "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]"
       "[CDF_SIZE(EXT_PARTITION_TYPES)]");
 
+#if CONFIG_EXT_RECUR_PARTITIONS
+  cts_each_dim[0] = PARTITION_CONTEXTS_REC;
+  cts_each_dim[1] = PARTITION_TYPES_REC;
+  int part_types_each_ctx_rec[PARTITION_CONTEXTS_REC] = { 2, 2, 2, 2, 4, 4, 4,
+                                                          4, 4, 4, 4, 4, 4, 4,
+                                                          4, 4, 3, 3, 3, 3 };
+  optimize_cdf_table_var_modes_2d(
+      &fc.partition_rec[0][0], probsfile, 2, cts_each_dim,
+      part_types_each_ctx_rec,
+      "static const aom_cdf_prob "
+      "default_partition_rec_cdf[PARTITION_CONTEXTS_REC]"
+      "[CDF_SIZE(PARTITION_TYPES_REC)]");
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+
   /* tx type */
   cts_each_dim[0] = EXT_TX_SETS_INTRA;
   cts_each_dim[1] = EXT_TX_SIZES;