Fix misplaced zeroing of dst with PVQ in intra 4x4

With PVQ, the dst buffer must be initialized to zero
before av1_inv_txfm_add_*() is called.
This bug seems to have been introduced while resolving
conflicts when nextgenv2 was merged.

BD-Rate change:
                PSNR  PSNR-HVS  SSIM  CIEDE 2000  MS SSIM
subset1-mono    -0.25 -0.25     -0.23 -0.26       -0.23
objective1-fast -0.17 -0.26     -0.14 -0.04       -0.18

Change-Id: I7c6b793ba0aa5f1e3d419312cbbe5c207a68f1f8
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 4effdf9..3617f05 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2329,9 +2329,17 @@
                                        tx_type, &rate_pvq, x->pvq_speed, NULL);
           ratey += rate_pvq;
 #endif
+#if CONFIG_PVQ
+          if (!skip) {
+            for (j = 0; j < tx_blk_size; j++)
+              for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
+#endif
+            av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+                                 dst_stride, p->eobs[block], tx_type, 0);
+#if CONFIG_PVQ
+          }
+#endif
           // No need for av1_block_error2_c because the ssz is unused
-          av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
-                               dst_stride, p->eobs[block], tx_type, 0);
           cpi->fn_ptr[BLOCK_4X4].vf(src, src_stride, dst, dst_stride, &tmp);
           dist = (int64_t)tmp << 4;
           distortion += dist;
@@ -2340,14 +2348,6 @@
           // in the frequency domain, the overhead of encoding effort is low.
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
-#if CONFIG_PVQ
-          if (!skip) {
-            for (j = 0; j < tx_blk_size; j++)
-              for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
-#endif
-#if CONFIG_PVQ
-          }
-#endif
         }
       }
     }