Add a flag to use quantization matrices during RD search.

The flag "dist-metric", when set to "qm-psnr", will weight the error in
transform space by the quantization matrix; this should be a
psychovisual improvement. Not setting the flag, or setting it to "psnr",
will not change the current behaviour.

Note that this flag is not tested (and may not work) with 64-point
transforms, as those transforms discard some coefficients before
quantization.

BUG=aomedia:3209

Change-Id: Ied5a1d9bb127263de19becf7302b37219f70a324
diff --git a/aom/aomcx.h b/aom/aomcx.h
index a263f15..0e4e12a 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -1545,6 +1545,23 @@
   AOM_TUNE_BUTTERAUGLI = 8,
 } aom_tune_metric;
 
+/*!\brief Distortion metric to use for RD optimization.
+ *
+ * Changes the encoder to use a different distortion metric for RD search. Note
+ * that this value operates on a "lower level" compared to aom_tune_metric - it
+ * affects the distortion metric inside a block, while aom_tune_metric only
+ * affects RD across blocks.
+ *
+ */
+typedef enum {
+  // Use PSNR for in-block rate-distortion optimization.
+  AOM_DIST_METRIC_PSNR,
+  // Use quantization matrix-weighted PSNR for in-block rate-distortion
+  // optimization. If --enable-qm=1 is not specified, this falls back to
+  // behaving in the same way as AOM_DIST_METRIC_PSNR.
+  AOM_DIST_METRIC_QM_PSNR,
+} aom_dist_metric;
+
 #define AOM_MAX_LAYERS 32   /**< Max number of layers */
 #define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
 #define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 914ecec..f5c4a23 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -453,6 +453,7 @@
   &g_av1_codec_arg_defs.second_pass_log,
   &g_av1_codec_arg_defs.fwd_kf_dist,
   &g_av1_codec_arg_defs.strict_level_conformance,
+  &g_av1_codec_arg_defs.dist_metric,
   NULL,
 };
 
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index c70f059..bf5d6b6 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -50,6 +50,12 @@
   { NULL, 0 }
 };
 
+static const struct arg_enum_list dist_metric_enum[] = {
+  { "psnr", AOM_DIST_METRIC_PSNR },
+  { "qm-psnr", AOM_DIST_METRIC_QM_PSNR },
+  { NULL, 0 }
+};
+
 #if CONFIG_AV1_ENCODER
 static const struct arg_enum_list timing_info_enum[] = {
   { "unspecified", AOM_TIMING_UNSPECIFIED },
@@ -284,6 +290,9 @@
       ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"),
   .tune_metric = ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with",
                               tuning_enum),
+  .dist_metric = ARG_DEF_ENUM(
+      NULL, "dist-metric", 1,
+      "Distortion metric to use for in-block optimization", dist_metric_enum),
   .cq_level =
       ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"),
   .max_intra_rate_pct =
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index 4816b5e..6e34638 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -114,6 +114,7 @@
   arg_def_t arnr_maxframes;
   arg_def_t arnr_strength;
   arg_def_t tune_metric;
+  arg_def_t dist_metric;
   arg_def_t cq_level;
   arg_def_t max_intra_rate_pct;
 #if CONFIG_AV1_ENCODER
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 5fcd6fc..6a44322 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -54,6 +54,7 @@
   aom_tune_metric tuning;
   const char *vmaf_model_path;
   const char *partition_info_path;
+  aom_dist_metric dist_metric;
   unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
   unsigned int rc_max_inter_bitrate_pct;
@@ -227,6 +228,7 @@
   AOM_TUNE_PSNR,  // tuning
   "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
   ".",                                        // partition info path
+  AOM_DIST_METRIC_PSNR,                       // dist_metric
   10,                                         // cq_level
   0,                                          // rc_max_intra_bitrate_pct
   0,                                          // rc_max_inter_bitrate_pct
@@ -371,6 +373,7 @@
   AOM_TUNE_PSNR,  // tuning
   "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
   ".",                                        // partition info path
+  AOM_DIST_METRIC_PSNR,                       // dist_metric
   10,                                         // cq_level
   0,                                          // rc_max_intra_bitrate_pct
   0,                                          // rc_max_inter_bitrate_pct
@@ -797,6 +800,9 @@
 
   RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_BUTTERAUGLI);
 
+  RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR,
+              AOM_DIST_METRIC_QM_PSNR);
+
   RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
               AOM_TIMING_DEC_MODEL);
 
@@ -1234,6 +1240,7 @@
     tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector;
     tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename;
   }
+  tune_cfg->dist_metric = extra_cfg->dist_metric;
 #if CONFIG_DENOISE
   oxcf->noise_level = extra_cfg->noise_level;
   oxcf->noise_block_size = extra_cfg->noise_block_size;
@@ -3599,6 +3606,9 @@
                             argv, err_string)) {
     err = allocate_and_set_string(value, default_extra_cfg.partition_info_path,
                                   &extra_cfg.partition_info_path, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dist_metric, argv,
+                              err_string)) {
+    extra_cfg.dist_metric = arg_parse_enum_helper(&arg, err_string);
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv,
                               err_string)) {
     extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string);
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index fc711b9..673192e 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -437,6 +437,12 @@
    * Flag to enable/disable DC block prediction.
    */
   unsigned int predict_dc_level;
+
+  /*!
+   * Whether or not we should use the quantization matrix as weights for PSNR
+   * during RD search.
+   */
+  int use_qm_dist_metric;
 } TxfmSearchParams;
 
 /*!\cond */
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 06bc0c6..aeb277a 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -761,6 +761,8 @@
   aom_tune_content content;
   // Indicates the film grain parameters.
   int film_grain_test_vector;
+  // Indicates the in-block distortion metric to use.
+  aom_dist_metric dist_metric;
 } TuneCfg;
 
 typedef struct {
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 7076f91..827d843 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -462,6 +462,14 @@
 static INLINE void set_tx_domain_dist_params(
     const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
     int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+  if (txfm_params->use_qm_dist_metric) {
+    // QM-weighted PSNR is computed in transform space, so we need to forcibly
+    // enable the use of tx domain distortion.
+    txfm_params->use_transform_domain_distortion = 1;
+    txfm_params->tx_domain_dist_threshold = 0;
+    return;
+  }
+
   if (!enable_winner_mode_for_tx_domain_dist) {
     txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
@@ -492,6 +500,9 @@
   const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
   TxfmSearchParams *txfm_params = &x->txfm_search_params;
 
+  txfm_params->use_qm_dist_metric =
+      cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR;
+
   switch (mode_eval_type) {
     case DEFAULT_EVAL:
       txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 79c21a6..e24800b 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -1046,8 +1046,38 @@
   }
 }
 
+static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff,
+                                         const tran_low_t *dqcoeff,
+                                         intptr_t block_size,
+                                         const qm_val_t *qmatrix,
+                                         const int16_t *scan, int64_t *ssz) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int64_t weight = qmatrix[scan[i]];
+    int64_t dd = coeff[i] - dqcoeff[i];
+    dd *= weight;
+    int64_t cc = coeff[i];
+    cc *= weight;
+    // The ranges of coeff and dqcoeff are
+    //  bd8 : 18 bits (including sign)
+    //  bd10: 20 bits (including sign)
+    //  bd12: 22 bits (including sign)
+    // As AOM_QM_BITS is 5, the intermediate quantities in the calculation
+    // below should fit in 54 bits, thus no overflow should happen.
+    error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+    sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+  }
+
+  *ssz = sqcoeff;
+  return error;
+}
+
 static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
-                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        TX_SIZE tx_size,
+                                        const qm_val_t *qmatrix,
+                                        const int16_t *scan, int64_t *out_dist,
                                         int64_t *out_sse) {
   const struct macroblock_plane *const p = &x->plane[plane];
   // Transform domain distortion computation is more efficient as it does
@@ -1062,12 +1092,21 @@
   tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #if CONFIG_AV1_HIGHBITDEPTH
   MACROBLOCKD *const xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd))
+  if (is_cur_buf_hbd(xd)) {
+    // TODO(veluca): handle use_qm_dist_metric for HBD too.
     *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
                                        xd->bd);
-  else
+  } else {
 #endif
-    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+    if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) {
+      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+    } else {
+      *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix,
+                                     scan, &this_sse);
+    }
+#if CONFIG_AV1_HIGHBITDEPTH
+  }
+#endif
 
   *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
   *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
@@ -1129,7 +1168,10 @@
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
@@ -1162,7 +1204,10 @@
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
@@ -1254,7 +1299,10 @@
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
     // tx domain dist
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     txk_map[num_cand] = tx_type;
     rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
@@ -2110,6 +2158,8 @@
 
     // Calculate rate cost of quantized coefficients.
     if (quant_param.use_optimize_b) {
+      // TODO(aomedia:3209): update Trellis quantization to take into account
+      // quantization matrices.
       av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
                      &rate_cost);
     } else {
@@ -2130,7 +2180,10 @@
       this_rd_stats.dist = dist_block_px_domain(
           cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
     } else if (use_transform_domain_distortion) {
-      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+      const SCAN_ORDER *const scan_order =
+          get_scan(txfm_param.tx_size, txfm_param.tx_type);
+      dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                           scan_order->scan, &this_rd_stats.dist,
                            &this_rd_stats.sse);
     } else {
       int64_t sse_diff = INT64_MAX;
@@ -2147,7 +2200,10 @@
         // to decide if we should do pixel domain distortion. If the energy
         // is mostly in first quadrant, then it is unlikely that we have
         // overflow issue in inverse transform.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+        const SCAN_ORDER *const scan_order =
+            get_scan(txfm_param.tx_size, txfm_param.tx_type);
+        dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                             scan_order->scan, &this_rd_stats.dist,
                              &this_rd_stats.sse);
         sse_diff = block_sse - this_rd_stats.sse;
       }
@@ -2917,7 +2973,10 @@
       this_rd_stats.rate =
           cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
 
-      dist_block_tx_domain(x, 0, i, tx_size, &this_rd_stats.dist,
+      const SCAN_ORDER *const scan_order =
+          get_scan(txfm_param.tx_size, txfm_param.tx_type);
+      dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+                           scan_order->scan, &this_rd_stats.dist,
                            &this_rd_stats.sse);
 
       const int64_t no_skip_txfm_rd =
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index b2f21e5..84ca64c 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -151,6 +151,11 @@
   }
 #endif
 
+  void SetOption(const char *name, const char *value) {
+    const aom_codec_err_t res = aom_codec_set_option(&encoder_, name, value);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
   void Config(const aom_codec_enc_cfg_t *cfg) {
     const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
diff --git a/test/end_to_end_qmpsnr_test.cc b/test/end_to_end_qmpsnr_test.cc
new file mode 100644
index 0000000..84a5c78
--- /dev/null
+++ b/test/end_to_end_qmpsnr_test.cc
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const unsigned int kCqLevel = 18;
+// List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+                                  83.0, 82.3, 81.1, 81.1 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// This class is used to check adherence to given ssim value, while using the
+// "dist-metric=qm-psnr" option.
+class EndToEndQMPSNRTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndQMPSNRTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+        ssim_(0.0) {}
+
+  ~EndToEndQMPSNRTest() override {}
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  void BeginPassHook(unsigned int) override {
+    nframes_ = 0;
+    ssim_ = 0.0;
+  }
+
+  void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+                               const aom_image_t *img_enc,
+                               aom_bit_depth_t bit_depth,
+                               unsigned int input_bit_depth) override {
+    double frame_ssim;
+    double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+    int crop_widths[PLANE_TYPES];
+    int crop_heights[PLANE_TYPES];
+    crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+    crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+    // Width of UV planes calculated based on chroma_shift values.
+    crop_widths[PLANE_TYPE_UV] =
+        img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w;
+    crop_heights[PLANE_TYPE_UV] =
+        img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h;
+    nframes_++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    uint8_t is_hbd = bit_depth > AOM_BITS_8;
+    if (is_hbd) {
+      // HBD ssim calculation.
+      uint8_t shift = bit_depth - input_bit_depth;
+      for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+        const int is_uv = i > AOM_PLANE_Y;
+        plane_ssim[i] = aom_highbd_ssim2(
+            CONVERT_TO_BYTEPTR(img_src->planes[i]),
+            CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+            img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+            crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+      }
+      frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                   .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+      // Accumulate to find sequence level ssim value.
+      ssim_ += frame_ssim;
+      return;
+    }
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // LBD ssim calculation.
+    for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+      const int is_uv = i > AOM_PLANE_Y;
+      plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+                                img_src->stride[is_uv], img_enc->stride[is_uv],
+                                crop_widths[is_uv], crop_heights[is_uv]);
+    }
+    frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                 .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+    // Accumulate to find sequence level ssim value.
+    ssim_ += frame_ssim;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      encoder->SetOption("dist-metric", "qm-psnr");
+    }
+  }
+
+  double GetAverageSsim() const {
+    if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+    return 0.0;
+  }
+
+  double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                        kFrames));
+    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double ssim = GetAverageSsim();
+    EXPECT_GT(ssim, GetSsimThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const TestVideoParam test_video_param_;
+  const int cpu_used_;
+  unsigned int nframes_;
+  double ssim_;
+};
+
+class EndToEndQMPSNRTestLarge : public EndToEndQMPSNRTest {};
+
+TEST_P(EndToEndQMPSNRTestLarge, EndtoEndQMPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndQMPSNRTest, EndtoEndQMPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 951881e..cb2293e 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -262,6 +262,7 @@
               "${AOM_ROOT}/test/comp_mask_variance_test.cc"
               "${AOM_ROOT}/test/encodemb_test.cc"
               "${AOM_ROOT}/test/encodetxb_test.cc"
+              "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
               "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
               "${AOM_ROOT}/test/error_block_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
@@ -294,6 +295,7 @@
 
   if(CONFIG_REALTIME_ONLY)
     list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
                      "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
                      "${AOM_ROOT}/test/firstpass_test.cc"
                      "${AOM_ROOT}/test/frame_error_test.cc"