Compute gradient data once at superblock level

This CL generates gradient data once at the superblock level and
reuses it at the block level when partition_search_type is
SEARCH_PARTITION. This optimization is enabled only for intra
frames. Because gradient data is derived from source pixels, this
design avoids repeating the same pixel-level gradient calculations.

For AVIF still-image encoding, the instruction count reduction is:

           Instruction Count
  speed      Reduction(%)
    0           0.863
    1           1.500
    2           1.217
    3           1.812
    4           2.844
    5           3.242
    6           0.470

BUG=aomedia:2996
BUG=aomedia:2959

Change-Id: I0cf14b50c5598654dd6419b2793908c810ec7cbe
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 0d5d383..6cdc2c5 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -633,6 +633,10 @@
                                  sizeof(*thread_data->td->tmp_pred_bufs[j])));
       }
 
+      CHECK_MEM_ERROR(cm, thread_data->td->pixel_gradient_info,
+                      aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) *
+                                 PLANE_TYPES * MAX_SB_SQUARE));
+
       if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
         const int num_64x64_blocks =
             (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
@@ -873,6 +877,8 @@
         thread_data->td->mb.tmp_pred_bufs[j] =
             thread_data->td->tmp_pred_bufs[j];
       }
+      thread_data->td->mb.pixel_gradient_info =
+          thread_data->td->pixel_gradient_info;
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {