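CLPF: Add quality-dependent damping in the constrain function

The shift applied in constrain() used to be derived from the bit depth
alone (bitdepth - 3 - log2(strength)).  It is now
damping - log2(strength), where

  damping = bit_depth - 5 - (plane != AOM_PLANE_Y) + (base_qindex >> 6)

so the filter cut-off grows with the base quantiser index and is one
step lower for the chroma planes.  The damping value is passed down to
the block filters and the encoder-side detect functions, and the
decision callback gains a plane argument.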

PSNR YCbCr:  -0.17%     -0.03%     -0.40%
APSNR YCbCr: -0.17%     -0.02%     -0.39%
PSNRHVS:     -0.06%
SSIM:        -0.17%
MSSSIM:      -0.07%
CIEDE2000:   -0.12%

Change-Id: I69a4b6a4e18c22c3930069396540a6fee45cb30d
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b64de8e..eafdb2b 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -854,8 +854,8 @@
 if (aom_config("CONFIG_CDEF") eq "yes") {
   if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
-    add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
-    add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
+    add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
+    add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
     # VS compiling for 32 bit targets does not support vector types in
     # structs as arguments, which makes the v256 type of the intrinsics
     # hard to support, so optimizations for this target are disabled.
@@ -866,8 +866,8 @@
     }
   }
   add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
-  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
-  add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
+  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
+  add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp";
   # VS compiling for 32 bit targets does not support vector types in
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 8dfe5c0..7d9933d 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -16,25 +16,25 @@
 
 int sign(int i) { return i < 0 ? -1 : 1; }
 
-int constrain(int x, int s, unsigned int bitdepth) {
+int constrain(int x, int s, unsigned int damping) {
   return sign(x) *
-         AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s + (abs(x) >> (bitdepth - 3 -
-                                                               get_msb(s)))));
+         AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s +
+                                          (abs(x) >> (damping - get_msb(s)))));
 }
 
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
-                    int H, int s, unsigned int bd) {
-  int delta = 1 * constrain(A - X, s, bd) + 3 * constrain(B - X, s, bd) +
-              1 * constrain(C - X, s, bd) + 3 * constrain(D - X, s, bd) +
-              3 * constrain(E - X, s, bd) + 1 * constrain(F - X, s, bd) +
-              3 * constrain(G - X, s, bd) + 1 * constrain(H - X, s, bd);
+                    int H, int s, unsigned int dmp) {
+  int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
+              1 * constrain(C - X, s, dmp) + 3 * constrain(D - X, s, dmp) +
+              3 * constrain(E - X, s, dmp) + 1 * constrain(F - X, s, dmp) +
+              3 * constrain(G - X, s, dmp) + 1 * constrain(H - X, s, dmp);
   return (8 + delta - (delta < 0)) >> 4;
 }
 
 void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
                       int dstride, int x0, int y0, int sizex, int sizey,
                       unsigned int strength, BOUNDARY_TYPE bt,
-                      unsigned int bitdepth) {
+                      unsigned int damping) {
   int x, y;
   const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
   const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
@@ -53,7 +53,7 @@
       const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
       const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
       const int delta =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
       dst[y * dstride + x] = X + delta;
     }
   }
@@ -64,7 +64,7 @@
 void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
                           int dstride, int x0, int y0, int sizex, int sizey,
                           unsigned int strength, BOUNDARY_TYPE bt,
-                          unsigned int bitdepth) {
+                          unsigned int damping) {
   int x, y;
   const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
   const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
@@ -83,7 +83,7 @@
       const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
       const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
       const int delta =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
       dst[y * dstride + x] = X + delta;
     }
   }
@@ -91,14 +91,13 @@
 #endif
 
 // Return number of filtered blocks
-void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                    int enable_fb_flag, unsigned int strength,
-                    unsigned int fb_size_log2, int plane,
-                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                    const YV12_BUFFER_CONFIG *,
-                                    const AV1_COMMON *cm, int, int, int,
-                                    unsigned int, unsigned int, int8_t *)) {
+void av1_clpf_frame(
+    const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
+    AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
+    unsigned int fb_size_log2, int plane,
+    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                    const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
+                    int, unsigned int, unsigned int, int8_t *, int)) {
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
@@ -124,6 +123,11 @@
           ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
           : frame->y_buffer;
   uint8_t *dst_buffer;
+  // Damping is the log2 of the cut-off point of the constrain function.
+  // For instance, with a damping of 5, neighbour differences of 32 or more
+  // are ignored and about half the strength is applied for a difference of 16.
+  int damping =
+      cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
 
 // Make buffer space for in-place filtering
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -169,7 +173,8 @@
            decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
                     fb_size_log2,
                     cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
-                        xoff / MIN_FB_SIZE))) {
+                        xoff / MIN_FB_SIZE,
+                    plane))) {
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
@@ -260,16 +265,16 @@
                 aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                    CONVERT_TO_SHORTPTR(dst_buffer), sstride,
                                    dstride, xpos, ypos, sizex, sizey, strength,
-                                   boundary_type, cm->bit_depth);
+                                   boundary_type, damping);
               } else {
                 aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
                                ypos, sizex, sizey, strength, boundary_type,
-                               cm->bit_depth);
+                               damping);
               }
 #else
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
                              ypos, sizex, sizey, strength, boundary_type,
-                             cm->bit_depth);
+                             damping);
 #endif
             }
           }
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 1642fb3..b50b7a6 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -19,7 +19,7 @@
 #define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
 
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
-                    int H, int b, unsigned int bd);
+                    int H, int b, unsigned int dmp);
 void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
                     const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                     int enable_fb_flag, unsigned int strength,
@@ -27,6 +27,6 @@
                     int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                     const YV12_BUFFER_CONFIG *,
                                     const AV1_COMMON *cm, int, int, int,
-                                    unsigned int, unsigned int, int8_t *));
+                                    unsigned int, unsigned int, int8_t *, int));
 
 #endif
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 636d2a1..b36553f 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -17,7 +17,8 @@
 // Process blocks of width 8, two lines at a time, 8 bit.
 static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
                         int dstride, int x0, int y0, int sizey,
-                        BOUNDARY_TYPE bt, unsigned int strength) {
+                        BOUNDARY_TYPE bt, unsigned int strength,
+                        unsigned int dmp) {
   const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
   const int right = !(bt & TILE_RIGHT_BOUNDARY);
   const int left = !(bt & TILE_LEFT_BOUNDARY);
@@ -68,7 +69,7 @@
       f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
     }
 
-    o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
+    o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
     v64_store_aligned(dst, v128_high_v64(o));
     v64_store_aligned(dst + dstride, v128_low_v64(o));
     src += sstride * 2;
@@ -79,7 +80,7 @@
 // As above, but with no clipping tests
 static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizey,
-                               unsigned int strength) {
+                               unsigned int strength, unsigned int dmp) {
   int y;
 
   dst += x0 + y0 * dstride;
@@ -102,8 +103,8 @@
                                  v64_load_unaligned(src + 1 + sstride));
     const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                  v64_load_unaligned(src + 2 + sstride));
-    const v128 o =
-        calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, strength);
+    const v128 o = calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h,
+                              strength, dmp);
 
     v64_store_aligned(dst, v128_high_v64(o));
     v64_store_aligned(dst + dstride, v128_low_v64(o));
@@ -115,7 +116,8 @@
 // Process blocks of width 4, four lines at a time, 8 bit.
 static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
                         int dstride, int x0, int y0, int sizey,
-                        BOUNDARY_TYPE bt, unsigned int strength) {
+                        BOUNDARY_TYPE bt, unsigned int strength,
+                        unsigned int dmp) {
   const int right = !(bt & TILE_RIGHT_BOUNDARY);
   const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1;
   const int left = !(bt & TILE_LEFT_BOUNDARY);
@@ -178,7 +180,7 @@
       f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
     }
 
-    o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
+    o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
     u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
     u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
     u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
@@ -192,7 +194,7 @@
 // As above, but with no clipping tests
 static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizey,
-                               unsigned int strength) {
+                               unsigned int strength, unsigned int dmp) {
   int y;
 
   dst += x0 + y0 * dstride;
@@ -229,7 +231,7 @@
                                 u32_load_unaligned(src + 3 * sstride + 2));
 
     const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g,
-                              h, strength);
+                              h, strength, dmp);
 
     u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
     u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
@@ -244,34 +246,34 @@
 void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, unsigned int strength,
-                               BOUNDARY_TYPE bt, unsigned int bd) {
+                               BOUNDARY_TYPE bt, unsigned int dmp) {
   if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
     // Fallback to C for odd sizes:
     // * block widths not 4 or 8
     // * block heights not a multiple of 4 if the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength,
-                     bt, bd);
+                     bt, dmp);
   } else {
     if (bt)
       (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0,
-                                               y0, sizey, bt, strength);
+                                               y0, sizey, bt, strength, dmp);
     else
       (sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)(
-          src, dst, sstride, dstride, x0, y0, sizey, strength);
+          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
   }
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
 // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
-// strength + (abs(a - b) >> (bd - 3 - log2(s)))))
+// strength + (abs(a - b) >> (dmp - log2(s)))))
 SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
-                               unsigned int bd) {
+                               unsigned int dmp) {
   const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b));
   const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a);  // -(a <= b)
   const v128 zero = v128_zero();
   const v128 s = v128_max_s16(
       zero, v128_sub_16(v128_dup_16(strength),
-                        v128_shr_u16(diff, bd - 3 - get_msb(strength))));
+                        v128_shr_u16(diff, dmp - get_msb(strength))));
   return v128_sub_16(
       v128_xor(sign,
                v128_max_s16(
@@ -280,20 +282,21 @@
       sign);
 }
 
-// delta = 1/16 * constrain(a, x, s, bd) + 3/16 * constrain(b, x, s, bd) +
-//         1/16 * constrain(c, x, s, bd) + 3/16 * constrain(d, x, s, bd) +
-//         3/16 * constrain(e, x, s, bd) + 1/16 * constrain(f, x, s, bd) +
-//         3/16 * constrain(g, x, s, bd) + 1/16 * constrain(h, x, s, bd)
+// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
+//         1/16 * constrain(c, x, s, dmp) + 3/16 * constrain(d, x, s, dmp) +
+//         3/16 * constrain(e, x, s, dmp) + 1/16 * constrain(f, x, s, dmp) +
+//         3/16 * constrain(g, x, s, dmp) + 1/16 * constrain(h, x, s, dmp)
 SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                 v128 f, v128 g, v128 h, unsigned int s,
-                                unsigned int bd) {
+                                unsigned int dmp) {
   const v128 bdeg = v128_add_16(
-      v128_add_16(constrain_hbd(b, x, s, bd), constrain_hbd(d, x, s, bd)),
-      v128_add_16(constrain_hbd(e, x, s, bd), constrain_hbd(g, x, s, bd)));
+      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)),
+      v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp)));
   const v128 delta = v128_add_16(
       v128_add_16(
-          v128_add_16(constrain_hbd(a, x, s, bd), constrain_hbd(c, x, s, bd)),
-          v128_add_16(constrain_hbd(f, x, s, bd), constrain_hbd(h, x, s, bd))),
+          v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)),
+          v128_add_16(constrain_hbd(f, x, s, dmp),
+                      constrain_hbd(h, x, s, dmp))),
       v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
   return v128_add_16(
       x,
@@ -305,23 +308,23 @@
 
 static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                             v128 f, v128 g, v128 h, uint16_t *dst,
-                            unsigned int s, unsigned int bd, int dstride) {
-  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd);
+                            unsigned int s, unsigned int dmp, int dstride) {
+  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
   v64_store_aligned(dst, v128_high_v64(o));
   v64_store_aligned(dst + dstride, v128_low_v64(o));
 }
 
 static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                             v128 f, v128 g, v128 h, uint16_t *dst,
-                            unsigned int s, unsigned int bd) {
-  v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd));
+                            unsigned int s, unsigned int dmp) {
+  v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp));
 }
 
 // Process blocks of width 4, two lines at time.
 SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
                                  int sstride, int dstride, int x0, int y0,
                                  int sizey, unsigned int strength,
-                                 BOUNDARY_TYPE bt, unsigned int bd) {
+                                 BOUNDARY_TYPE bt, unsigned int dmp) {
   const int right = !(bt & TILE_RIGHT_BOUNDARY);
   const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
   const int left = !(bt & TILE_LEFT_BOUNDARY);
@@ -372,7 +375,7 @@
       e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
       f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
     }
-    calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, bd, dstride);
+    calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, dmp, dstride);
     src += sstride * 2;
     dst += dstride * 2;
   }
@@ -383,7 +386,7 @@
                                         int sstride, int dstride, int x0,
                                         int y0, int sizey,
                                         unsigned int strength,
-                                        unsigned int bd) {
+                                        unsigned int dmp) {
   int y;
 
   dst += x0 + y0 * dstride;
@@ -408,7 +411,7 @@
                                  v64_load_unaligned(src + 2 + sstride));
 
     calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
-                    strength, bd, dstride);
+                    strength, dmp, dstride);
     src += sstride * 2;
     dst += dstride * 2;
   }
@@ -418,7 +421,7 @@
 SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
                                 int dstride, int x0, int y0, int sizey,
                                 unsigned int strength, BOUNDARY_TYPE bt,
-                                unsigned int bd) {
+                                unsigned int dmp) {
   const int right = !(bt & TILE_RIGHT_BOUNDARY);
   const int left = !(bt & TILE_LEFT_BOUNDARY);
   const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2;
@@ -463,7 +466,7 @@
       e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
       f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
     }
-    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
+    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
     dst += dstride;
   }
 }
@@ -472,7 +475,7 @@
 SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
                                        int sstride, int dstride, int x0, int y0,
                                        int sizey, unsigned int strength,
-                                       unsigned int bd) {
+                                       unsigned int dmp) {
   int y;
 
   dst += x0 + y0 * dstride;
@@ -489,7 +492,7 @@
     const v128 e = v128_load_unaligned(src + 1);
     const v128 f = v128_load_unaligned(src + 2);
 
-    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
+    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
     src += sstride;
     dst += dstride;
   }
@@ -498,20 +501,20 @@
 void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, unsigned int strength,
-                                   BOUNDARY_TYPE bt, unsigned int bd) {
+                                   BOUNDARY_TYPE bt, unsigned int dmp) {
   if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
     // Fallback to C for odd sizes:
     // * block width not 4 or 8
     // * block heights not a multiple of 2 if the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
-                         strength, bt, bd);
+                         strength, bt, dmp);
   } else {
     if (bt)
       (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
-          src, dst, sstride, dstride, x0, y0, sizey, strength, bt, bd);
+          src, dst, sstride, dstride, x0, y0, sizey, strength, bt, dmp);
     else
       (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)(
-          src, dst, sstride, dstride, x0, y0, sizey, strength, bd);
+          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
   }
 }
 #endif
diff --git a/av1/common/clpf_simd_kernel.h b/av1/common/clpf_simd_kernel.h
index 5412746..92ca340 100644
--- a/av1/common/clpf_simd_kernel.h
+++ b/av1/common/clpf_simd_kernel.h
@@ -16,11 +16,12 @@
 
 // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
-// strength + (abs(a - b) >> (5 - log2(s)))))
+// strength + (abs(a - b) >> (damping - log2(s)))))
-SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength) {
+SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength,
+                           unsigned int damping) {
   const v128 diff = v128_sub_8(v128_max_u8(a, b), v128_min_u8(a, b));
   const v128 sign = v128_cmpeq_8(v128_min_u8(a, b), a);  // -(a <= b)
   const v128 s = v128_ssub_u8(v128_dup_8(strength),
-                              v128_shr_u8(diff, 5 - get_msb(strength)));
+                              v128_shr_u8(diff, damping - get_msb(strength)));
   return v128_sub_8(v128_xor(sign, v128_ssub_u8(diff, v128_ssub_u8(diff, s))),
                     sign);
 }
@@ -30,14 +31,15 @@
 //         3/16 * constrain(e, x, s) + 1/16 * constrain(f, x, s) +
 //         3/16 * constrain(g, x, s) + 1/16 * constrain(h, x, s)
 SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
-                            v128 f, v128 g, v128 h, unsigned int s) {
+                            v128 f, v128 g, v128 h, unsigned int s,
+                            unsigned int dmp) {
   const v128 bdeg =
-      v128_add_8(v128_add_8(constrain(b, x, s), constrain(d, x, s)),
-                 v128_add_8(constrain(e, x, s), constrain(g, x, s)));
-  const v128 delta =
-      v128_add_8(v128_add_8(v128_add_8(constrain(a, x, s), constrain(c, x, s)),
-                            v128_add_8(constrain(f, x, s), constrain(h, x, s))),
-                 v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
+      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
+                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
+  const v128 delta = v128_add_8(
+      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
+                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
+      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
   return v128_add_8(
       x, v128_shr_s8(
              v128_add_8(v128_dup_8(8),
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 880ee83..57849cb 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2746,7 +2746,8 @@
                     UNUSED const YV12_BUFFER_CONFIG *org,
                     UNUSED const AV1_COMMON *cm, UNUSED int block_size,
                     UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
-                    UNUSED unsigned int fb_size_log2, int8_t *bit) {
+                    UNUSED unsigned int fb_size_log2, int8_t *bit,
+                    UNUSED int plane) {
   return *bit;
 }
 
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 4ea3989..3ef67cc 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -19,7 +19,7 @@
 void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
                        int ostride, int x0, int y0, int width, int height,
                        int *sum0, int *sum1, unsigned int strength, int size,
-                       unsigned int bd) {
+                       unsigned int dmp) {
   int x, y;
   for (y = y0; y < y0 + size; y++) {
     for (x = x0; x < x0 + size; x++) {
@@ -34,7 +34,7 @@
       const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
       const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
       const int delta =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bd);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, dmp);
       const int Y = X + delta;
       *sum0 += (O - X) * (O - X);
       *sum1 += (O - Y) * (O - Y);
@@ -45,7 +45,7 @@
 void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
                              int rstride, int ostride, int x0, int y0,
                              int width, int height, int *sum, int size,
-                             unsigned int bd) {
+                             unsigned int dmp) {
   int x, y;
 
   for (y = y0; y < y0 + size; y++) {
@@ -60,9 +60,9 @@
       const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
       const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
       const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
-      const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd);
-      const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd);
-      const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd);
+      const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp);
+      const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp);
+      const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp);
       const int F1 = X + delta1;
       const int F2 = X + delta2;
       const int F3 = X + delta3;
@@ -79,7 +79,8 @@
 void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
                            int rstride, int ostride, int x0, int y0, int width,
                            int height, int *sum0, int *sum1,
-                           unsigned int strength, int size, unsigned int bd) {
+                           unsigned int strength, int size, unsigned int bd,
+                           unsigned int dmp) {
   const int shift = bd - 8;
   int x, y;
   for (y = y0; y < y0 + size; y++) {
@@ -95,7 +96,7 @@
       const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
       const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
       const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H,
-                                        strength >> shift, bd - shift);
+                                        strength >> shift, dmp - shift);
       const int Y = X + delta;
       *sum0 += (O - X) * (O - X);
       *sum1 += (O - Y) * (O - Y);
@@ -107,7 +108,7 @@
 void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
                                  int rstride, int ostride, int x0, int y0,
                                  int width, int height, int *sum, int size,
-                                 unsigned int bd) {
+                                 unsigned int bd, unsigned int dmp) {
   const int shift = bd - 8;
   int x, y;
 
@@ -124,11 +125,11 @@
       const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
       const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
       const int delta1 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd - shift);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp - shift);
       const int delta2 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd - shift);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp - shift);
       const int delta3 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd - shift);
+          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp - shift);
       const int F1 = X + delta1;
       const int F2 = X + delta2;
       const int F3 = X + delta3;
@@ -144,8 +145,10 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res) {
+                      unsigned int fb_size_log2, int8_t *res, int plane) {
   int m, n, sum0 = 0, sum1 = 0;
+  int damping =
+      cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
 
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
@@ -160,18 +163,18 @@
                               CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
                               org->y_stride, xpos, ypos, rec->y_crop_width,
                               rec->y_crop_height, &sum0, &sum1, strength,
-                              block_size, cm->bit_depth);
+                              block_size, cm->bit_depth, damping);
         } else {
           aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
                           org->y_stride, xpos, ypos, rec->y_crop_width,
                           rec->y_crop_height, &sum0, &sum1, strength,
-                          block_size, cm->bit_depth);
+                          block_size, damping);
         }
 #else
         aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
                         org->y_stride, xpos, ypos, rec->y_crop_width,
                         rec->y_crop_height, &sum0, &sum1, strength, block_size,
-                        cm->bit_depth);
+                        damping);
 #endif
       }
     }
@@ -214,6 +217,9 @@
       plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
   int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
+  int damping =
+      cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
+
   sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
   if (plane == AOM_PLANE_Y &&
       fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
@@ -270,19 +276,19 @@
                 ->mbmi.skip;
 #if CONFIG_AOM_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
-        aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
-                                  CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
-                                  org_stride, xpos, ypos, rec_width, rec_height,
-                                  sum + skip, block_size, cm->bit_depth);
+        aom_clpf_detect_multi_hbd(
+            CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+            rec_stride, org_stride, xpos, ypos, rec_width, rec_height,
+            sum + skip, block_size, cm->bit_depth, damping);
       } else {
         aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
                               xpos, ypos, rec_width, rec_height, sum + skip,
-                              block_size, cm->bit_depth);
+                              block_size, damping);
       }
 #else
       aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
                             xpos, ypos, rec_width, rec_height, sum + skip,
-                            block_size, cm->bit_depth);
+                            block_size, damping);
 #endif
       filtered |= !skip;
     }
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index 586eed0..f92f7d2 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -17,7 +17,7 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res);
+                      unsigned int fb_size_log2, int8_t *res, int plane);
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h
index 177359f..5f61997 100644
--- a/av1/encoder/clpf_rdo_simd.h
+++ b/av1/encoder/clpf_rdo_simd.h
@@ -69,7 +69,7 @@
                                 int rstride, int ostride, int x0, int y0,
                                 int width, int height, int *sum0, int *sum1,
                                 unsigned int strength, int size,
-                                unsigned int bd) {
+                                unsigned int dmp) {
   const int bottom = height - 2 - y0;
   const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
@@ -78,7 +78,7 @@
 
   if (size != 8) {  // Fallback to plain C
     aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
-                      sum1, strength, size, bd);
+                      sum1, strength, size, dmp);
     return;
   }
 
@@ -90,8 +90,8 @@
     read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                    &a, &b, &c, &d, &e, &f, &g, &h);
     ssd0 = v128_ssd_u8(ssd0, o, r);
-    ssd1 =
-        v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength));
+    ssd1 = v128_ssd_u8(ssd1, o,
+                       calc_delta(r, a, b, c, d, e, f, g, h, strength, dmp));
     rec += rstride * 2;
     org += ostride * 2;
   }
@@ -102,17 +102,17 @@
 SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
                                   v128 d, v128 e, v128 f, v128 g, v128 h,
                                   ssd128_internal *ssd1, ssd128_internal *ssd2,
-                                  ssd128_internal *ssd3) {
-  *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1));
-  *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2));
-  *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4));
+                                  ssd128_internal *ssd3, unsigned int dmp) {
+  *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1, dmp));
+  *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2, dmp));
+  *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4, dmp));
 }
 
 // Test multiple filter strengths at once.
 void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                       int rstride, int ostride, int x0, int y0,
                                       int width, int height, int *sum, int size,
-                                      unsigned int bd) {
+                                      unsigned int dmp) {
   const int bottom = height - 2 - y0;
   const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
@@ -123,7 +123,7 @@
 
   if (size != 8) {  // Fallback to plain C
     aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
-                            sum, size, bd);
+                            sum, size, dmp);
     return;
   }
 
@@ -135,7 +135,7 @@
     read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                    &a, &b, &c, &d, &e, &f, &g, &h);
     ssd0 = v128_ssd_u8(ssd0, o, r);
-    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3);
+    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, dmp);
     rec += 2 * rstride;
     org += 2 * ostride;
   }
@@ -190,7 +190,8 @@
                                     int rstride, int ostride, int x0, int y0,
                                     int width, int height, int *sum0, int *sum1,
                                     unsigned int strength, int size,
-                                    unsigned int bitdepth) {
+                                    unsigned int bitdepth,
+                                    unsigned int damping) {
   const int shift = bitdepth - 8;
   const int bottom = height - 2 - y0;
   const int right = width - 8 - x0;
@@ -200,7 +201,7 @@
 
   if (size != 8) {  // Fallback to plain C
     aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
-                          sum0, sum1, strength, size, bitdepth);
+                          sum0, sum1, strength, size, bitdepth, damping);
     return;
   }
 
@@ -212,8 +213,8 @@
     read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                        &r, &a, &b, &c, &d, &e, &f, &g, &h, shift);
     ssd0 = v128_ssd_u8(ssd0, o, r);
-    ssd1 = v128_ssd_u8(
-        ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength >> shift));
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h,
+                                           strength >> shift, damping));
     rec += rstride * 2;
     org += ostride * 2;
   }
@@ -225,7 +226,8 @@
                                           const uint16_t *org, int rstride,
                                           int ostride, int x0, int y0,
                                           int width, int height, int *sum,
-                                          int size, unsigned int bitdepth) {
+                                          int size, unsigned int bitdepth,
+                                          unsigned int damping) {
   const int bottom = height - 2 - y0;
   const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
@@ -236,7 +238,7 @@
 
   if (size != 8) {  // Fallback to plain C
     aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
-                                height, sum, size, bitdepth);
+                                height, sum, size, bitdepth, damping);
     return;
   }
 
@@ -248,7 +250,8 @@
     read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                        &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8);
     ssd0 = v128_ssd_u8(ssd0, o, r);
-    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3);
+    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3,
+                     damping);
     rec += rstride * 2;
     org += ostride * 2;
   }