/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */
#ifndef AOM_COMMON_GDF_H_
#define AOM_COMMON_GDF_H_

#include "av1/common/gdf.h"
#include "av1/common/gdf_block.h"

static int gdf_num_stripes_in_tile(int stripe_size, int tile_size) {
  const int first_stripe_offset = GDF_TEST_STRIPE_OFF;
  return (tile_size + first_stripe_offset + stripe_size - 1) / stripe_size;
}

#ifndef NDEBUG
static int gdf_get_frame_stripe_from_row(AV1_COMMON *const cm, int row) {
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  if (cm->seq_params.disable_loopfilters_across_tiles) {
    const int mi_row = row >> MI_SIZE_LOG2;
    const int tile_row = get_tile_row_from_mi_row(&cm->tiles, mi_row);
    int fs = 0;
    for (int tr = 0; tr < tile_row; tr++)
      fs += cm->gdf_info.gdf_vert_stripes_per_tile[tr];
    const int tile_row_start = cm->tiles.row_start_sb[tile_row]
                               << cm->tiles.mib_size_log2;
    row -= (tile_row_start << MI_SIZE_LOG2);
    return fs + (row + GDF_TEST_STRIPE_OFF) / cm->gdf_info.gdf_unit_size;
  } else {
    return (row + GDF_TEST_STRIPE_OFF) / cm->gdf_info.gdf_unit_size;
  }
#else
  return (row + GDF_TEST_STRIPE_OFF) / cm->gdf_info.gdf_unit_size;
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
}
#endif  // NDEBUG

void init_gdf_test(GdfInfo *gi, int mib_size, int rec_height, int rec_width) {
  gi->gdf_mode = 0;
  gi->gdf_pic_qp_idx = 0;
  gi->gdf_pic_scale_idx = 0;
  gi->gdf_block_size = AOMMAX(mib_size << MI_SIZE_LOG2, GDF_TEST_BLK_SIZE);
  gi->gdf_stripe_size = GDF_TEST_STRIPE_SIZE;
  gi->gdf_unit_size = GDF_TEST_STRIPE_SIZE;
  gi->gdf_vert_blks_per_tile[0] = 1 + ((rec_height - 1) / gi->gdf_block_size);
  gi->gdf_block_num_h = 1 + ((rec_height - 1) / gi->gdf_block_size);
  gi->gdf_horz_blks_per_tile[0] = 1 + ((rec_width - 1) / gi->gdf_block_size);
  gi->gdf_block_num_w = 1 + ((rec_width - 1) / gi->gdf_block_size);
  gi->gdf_block_num = gi->gdf_block_num_h * gi->gdf_block_num_w;
  gi->gdf_vert_stripes_per_tile[0] =
      gdf_num_stripes_in_tile(gi->gdf_stripe_size, rec_height);
  gi->err_height = gi->gdf_unit_size;
  gi->lap_stride = gi->gdf_unit_size + GDF_ERR_STRIDE_MARGIN;
  gi->cls_stride = (gi->gdf_unit_size >> 1) + GDF_ERR_STRIDE_MARGIN;
  gi->err_stride = gi->gdf_unit_size + GDF_ERR_STRIDE_MARGIN;
}

void init_gdf(AV1_COMMON *cm) {
  GdfInfo *gi = &cm->gdf_info;
  gi->gdf_mode = 0;
  gi->gdf_pic_qp_idx = 0;
  gi->gdf_pic_scale_idx = 0;
  gi->gdf_block_size = AOMMAX(cm->mib_size << MI_SIZE_LOG2, GDF_TEST_BLK_SIZE);
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  const int num_tile_rows =
      cm->seq_params.disable_loopfilters_across_tiles ? cm->tiles.rows : 1;
  const int num_tile_cols =
      cm->seq_params.disable_loopfilters_across_tiles ? cm->tiles.cols : 1;
#else
  const int num_tile_rows = 1;
  const int num_tile_cols = 1;
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES

  gi->gdf_stripe_size = GDF_TEST_STRIPE_SIZE;
  gi->gdf_unit_size = GDF_TEST_STRIPE_SIZE;
  // Calculate number of blocks
  gi->gdf_block_num_h = 0;
  gi->gdf_block_num_w = 0;
  if (num_tile_rows == 1 && num_tile_cols == 1) {
    AV1PixelRect tile_rect = av1_whole_frame_rect(cm, 0);
    const int tile_height = tile_rect.bottom - tile_rect.top;
    const int tile_width = tile_rect.right - tile_rect.left;
    gi->gdf_vert_blks_per_tile[0] =
        1 + ((tile_height - 1) / gi->gdf_block_size);
    gi->gdf_block_num_h += gi->gdf_vert_blks_per_tile[0];
    gi->gdf_horz_blks_per_tile[0] = 1 + ((tile_width - 1) / gi->gdf_block_size);
    gi->gdf_block_num_w += gi->gdf_horz_blks_per_tile[0];
    gi->gdf_vert_stripes_per_tile[0] =
        gdf_num_stripes_in_tile(gi->gdf_stripe_size, tile_height);
  } else {
    for (int tile_row = 0; tile_row < num_tile_rows; ++tile_row) {
      TileInfo tile_info;
      av1_tile_init(&tile_info, cm, tile_row, 0);
      AV1PixelRect tile_rect = av1_get_tile_rect(&tile_info, cm, 0);
      const int tile_height = tile_rect.bottom - tile_rect.top;
      gi->gdf_vert_blks_per_tile[tile_row] =
          1 + ((tile_height - 1) / gi->gdf_block_size);
      gi->gdf_block_num_h += gi->gdf_vert_blks_per_tile[tile_row];
      gi->gdf_vert_stripes_per_tile[tile_row] =
          gdf_num_stripes_in_tile(gi->gdf_stripe_size, tile_height);
    }
    for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) {
      TileInfo tile_info;
      av1_tile_init(&tile_info, cm, 0, tile_col);
      AV1PixelRect tile_rect = av1_get_tile_rect(&tile_info, cm, 0);
      const int tile_width = tile_rect.right - tile_rect.left;
      gi->gdf_horz_blks_per_tile[tile_col] =
          1 + ((tile_width - 1) / gi->gdf_block_size);
      gi->gdf_block_num_w += gi->gdf_horz_blks_per_tile[tile_col];
    }
  }
  gi->gdf_block_num = gi->gdf_block_num_h * gi->gdf_block_num_w;
  gi->err_height = gi->gdf_unit_size;
  gi->lap_stride = gi->gdf_unit_size + GDF_ERR_STRIDE_MARGIN;
  gi->cls_stride = (gi->gdf_unit_size >> 1) + GDF_ERR_STRIDE_MARGIN;
  gi->err_stride = gi->gdf_unit_size + GDF_ERR_STRIDE_MARGIN;
}

void alloc_gdf_buffers(GdfInfo *gi) {
  free_gdf_buffers(gi);
  gi->lap_ptr =
      (uint16_t **)aom_malloc(GDF_NET_INP_GRD_NUM * sizeof(uint16_t *));
  const int lap_buf_height = (gi->err_height >> 1) + 2;
  const int cls_buf_height = (gi->err_height >> 1) + 2;
  for (int i = 0; i < GDF_NET_INP_GRD_NUM; i++) {
    gi->lap_ptr[i] = (uint16_t *)aom_memalign(
        32, lap_buf_height * gi->lap_stride * sizeof(uint16_t));
    memset(gi->lap_ptr[i], 0,
           lap_buf_height * gi->lap_stride * sizeof(uint16_t));
  }
  gi->cls_ptr = (uint32_t *)aom_memalign(
      32, cls_buf_height * gi->cls_stride * sizeof(uint32_t));
  memset(gi->cls_ptr, 0, cls_buf_height * gi->cls_stride * sizeof(uint32_t));
  gi->err_ptr = (int16_t *)aom_memalign(
      32, gi->err_height * gi->err_stride * sizeof(int16_t));
  memset(gi->err_ptr, 0, gi->err_height * gi->err_stride * sizeof(int16_t));
  gi->gdf_block_flags = (int32_t *)aom_malloc(gi->gdf_block_num * sizeof(int));
  memset(gi->gdf_block_flags, 0, gi->gdf_block_num * sizeof(int));
#if CONFIG_GDF_IMPROVEMENT
  gi->glbs = (GDFLineBuffers *)aom_malloc(sizeof(GDFLineBuffers));
#endif
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  gi->tmp_save_left = (uint16_t *)aom_malloc(
      (gi->gdf_unit_size + 2 * GDF_TEST_EXTRA_HOR_BORDER) *
      GDF_TEST_EXTRA_VER_BORDER * sizeof(*gi->tmp_save_left));
  gi->tmp_save_right = (uint16_t *)aom_malloc(
      (gi->gdf_unit_size + 2 * GDF_TEST_EXTRA_HOR_BORDER) *
      GDF_TEST_EXTRA_VER_BORDER * sizeof(*gi->tmp_save_right));
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
}

void free_gdf_buffers(GdfInfo *gi) {
  if (gi->lap_ptr != NULL) {
    for (int i = 0; i < GDF_NET_INP_GRD_NUM; i++) {
      aom_free(gi->lap_ptr[i]);
      gi->lap_ptr[i] = NULL;
    }
    aom_free(gi->lap_ptr);
    gi->lap_ptr = NULL;
  }
  if (gi->cls_ptr != NULL) {
    aom_free(gi->cls_ptr);
    gi->cls_ptr = NULL;
  }
  if (gi->err_ptr != NULL) {
    aom_free(gi->err_ptr);
    gi->err_ptr = NULL;
  }
  if (gi->gdf_block_flags != NULL) {
    aom_free(gi->gdf_block_flags);
    gi->gdf_block_flags = NULL;
  }
#if CONFIG_GDF_IMPROVEMENT
  if (gi->glbs != NULL) {
    aom_free(gi->glbs);
    gi->glbs = NULL;
  }
#endif
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  if (gi->tmp_save_left != NULL) {
    aom_free(gi->tmp_save_left);
    gi->tmp_save_left = NULL;
  }
  if (gi->tmp_save_right != NULL) {
    aom_free(gi->tmp_save_right);
    gi->tmp_save_right = NULL;
  }
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
}

#define GDF_PRINT_INT(x) printf(#x " : %d\n", x)

void gdf_print_info(AV1_COMMON *cm, char *info, int poc) {
  printf("=================GDF %s info=================\n", info);

  GDF_PRINT_INT(cm->cur_frame->buf.y_width);
  GDF_PRINT_INT(cm->cur_frame->buf.y_height);
  GDF_PRINT_INT(cm->cur_frame->buf.y_stride);
  GDF_PRINT_INT(cm->cur_frame->buf.bit_depth);
  GDF_PRINT_INT(cm->quant_params.base_qindex);
  GDF_PRINT_INT(cm->ref_frames_info.ref_frame_distance[0]);
  GDF_PRINT_INT(cm->ref_frames_info.ref_frame_distance[1]);
  GDF_PRINT_INT(cm->current_frame.frame_type);
  GDF_PRINT_INT(cm->tiles.height);
  GDF_PRINT_INT(cm->tiles.width);
  GDF_PRINT_INT(cm->mib_size);

  printf("%s[%3d]: gdf_info = [ flag = %d ", info, poc, cm->gdf_info.gdf_mode);
  if (cm->gdf_info.gdf_mode > 0) {
    printf("=> (qp_idx, scale_idx) = (%3d %3d) ", cm->gdf_info.gdf_pic_qp_idx,
           cm->gdf_info.gdf_pic_scale_idx);
  }
  if (cm->gdf_info.gdf_mode > 1) {
    printf("(");
    for (int blk_idx = 0; blk_idx < cm->gdf_info.gdf_block_num; blk_idx++) {
      printf(" %d", cm->gdf_info.gdf_block_flags[blk_idx]);
    }
    printf(")");
  }
  printf(" ]\n");
}
#undef GDF_PRINT_INT

#if CONFIG_GDF_IMPROVEMENT

void gdf_extend_frame_highbd(uint16_t *data, int width, int height, int stride,
                             int border_horz, int border_vert) {
  uint16_t *data_p;
  int i, j;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
  }
  data_p = data - border_horz;
  for (i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}

void gdf_copy_guided_frame(AV1_COMMON *cm) {
  int top_buf = GDF_TEST_EXTRA_VER_BORDER;
  int bot_buf = GDF_TEST_EXTRA_VER_BORDER;
  const int rec_height = cm->cur_frame->buf.y_height;
  const int rec_width = cm->cur_frame->buf.y_width;
  const int rec_stride = cm->cur_frame->buf.y_stride;

  const int input_stride = (((rec_width + GDF_TEST_STRIPE_SIZE) >> 4) << 4) +
                           16;  // GDF_TEST_STRIPE_SIZE: max unit size
                                // 16: AVX2 vector length
  cm->gdf_info.inp_stride = input_stride;

  cm->gdf_info.inp_pad_ptr =
      (uint16_t *)aom_memalign(32, (top_buf + rec_height + bot_buf + 4) *
                                       input_stride * sizeof(uint16_t));
  for (int i = top_buf; i < top_buf + rec_height; i++) {
    memcpy(
        cm->gdf_info.inp_pad_ptr + i * input_stride + GDF_TEST_EXTRA_HOR_BORDER,
        cm->cur_frame->buf.buffers[AOM_PLANE_Y] + (i - top_buf) * rec_stride,
        sizeof(uint16_t) * rec_width);
    if (cm->cur_frame->buf.bit_depth > GDF_TEST_INP_PREC) {
      const unsigned int diff_bit_depth =
          cm->cur_frame->buf.bit_depth - GDF_TEST_INP_PREC;
      uint16_t *cur_line = cm->gdf_info.inp_pad_ptr + i * input_stride +
                           GDF_TEST_EXTRA_HOR_BORDER;
      for (int j = 0; j < rec_width; j++) {
        cur_line[j] >>= diff_bit_depth;
      }
    }
  }
  cm->gdf_info.inp_ptr = cm->gdf_info.inp_pad_ptr + top_buf * input_stride +
                         GDF_TEST_EXTRA_HOR_BORDER;
  gdf_extend_frame_highbd(cm->gdf_info.inp_ptr, rec_width, rec_height,
                          input_stride, GDF_TEST_EXTRA_HOR_BORDER,
                          GDF_TEST_EXTRA_VER_BORDER);
}

#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
void gdf_setup_processing_stripe_leftright_boundary(GdfInfo *gdf, int i_min,
                                                    int i_max, int j_min,
                                                    int j_max,
                                                    int tile_boundary_left,
                                                    int tile_boundary_right) {
  const int data_stride = gdf->inp_stride;
  const int h = i_max - i_min;
  const int w = j_max - j_min;
  const int h_border = GDF_TEST_EXTRA_HOR_BORDER;
  const int v_border = GDF_TEST_EXTRA_VER_BORDER;
  const int stride = GDF_TEST_EXTRA_HOR_BORDER;
  assert(h <= RESTORATION_PROC_UNIT_SIZE);
  uint16_t *data_tl = gdf->inp_ptr + i_min * data_stride + j_min;
  if (tile_boundary_left) {
    uint16_t *d = data_tl - v_border * data_stride - h_border;
    for (int i = 0; i < v_border; ++i) {
      memcpy(gdf->tmp_save_left + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride + h_border),
                   h_border);
    }
    for (int i = v_border; i < h + v_border; ++i) {
      memcpy(gdf->tmp_save_left + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride + h_border),
                   h_border);
    }
    for (int i = h + v_border; i < h + 2 * v_border; ++i) {
      memcpy(gdf->tmp_save_left + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride + h_border),
                   h_border);
    }
  }
  if (tile_boundary_right) {
    uint16_t *d = data_tl + w - v_border * data_stride;
    for (int i = 0; i < v_border; ++i) {
      memcpy(gdf->tmp_save_right + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride - 1), h_border);
    }
    for (int i = v_border; i < h + v_border; ++i) {
      memcpy(gdf->tmp_save_right + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride - 1), h_border);
    }
    for (int i = h + v_border; i < h + 2 * v_border; ++i) {
      memcpy(gdf->tmp_save_right + i * stride, d + i * data_stride,
             h_border * sizeof(*d));
      // Replicate
      aom_memset16(d + i * data_stride, *(d + i * data_stride - 1), h_border);
    }
  }
}

void gdf_restore_processing_stripe_leftright_boundary(GdfInfo *gdf, int i_min,
                                                      int i_max, int j_min,
                                                      int j_max,
                                                      int tile_boundary_left,
                                                      int tile_boundary_right) {
  const int data_stride = gdf->inp_stride;
  const int h = i_max - i_min;
  const int w = j_max - j_min;
  const int h_border = GDF_TEST_EXTRA_HOR_BORDER;
  const int v_border = GDF_TEST_EXTRA_VER_BORDER;
  const int stride = GDF_TEST_EXTRA_HOR_BORDER;
  assert(h <= RESTORATION_PROC_UNIT_SIZE);
  uint16_t *data_tl = gdf->inp_ptr + i_min * data_stride + j_min;
  if (tile_boundary_left) {
    uint16_t *d = data_tl - v_border * data_stride - h_border;
    for (int i = 0; i < h + 2 * v_border; ++i) {
      memcpy(d + i * data_stride, gdf->tmp_save_left + i * stride,
             h_border * sizeof(*d));
    }
  }
  if (tile_boundary_right) {
    uint16_t *d = data_tl + w - v_border * data_stride;
    for (int i = 0; i < h + 2 * v_border; ++i) {
      memcpy(d + i * data_stride, gdf->tmp_save_right + i * stride,
             h_border * sizeof(*d));
    }
  }
}
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES

void gdf_setup_reference_lines(AV1_COMMON *cm, int i_min, int i_max,
                               int frame_stripe) {
  const RestorationStripeBoundaries *rsb = &cm->rst_info[0].boundaries;
  assert(frame_stripe == gdf_get_frame_stripe_from_row(cm, i_min));
  const int rsb_row = frame_stripe * RESTORATION_CTX_VERT;

  const int rec_width = cm->cur_frame->buf.y_width;
  const int buf_x0_off = RESTORATION_BORDER_HORZ;
  const int buf_stride = rsb->stripe_boundary_stride;
  const int data_stride = cm->gdf_info.inp_stride;
  const int line_size = rec_width << 1;

  int copy_above = 1;
  int copy_below = 1;
  if (copy_above) {
    uint16_t *data_tl = cm->gdf_info.inp_ptr + i_min * data_stride;
    for (int i = -GDF_TEST_EXTRA_VER_BORDER; i < 0; ++i) {
      const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
      const int buf_off = buf_x0_off + buf_row * buf_stride;
      const uint16_t *buf = rsb->stripe_boundary_above + buf_off;
      uint16_t *dst = data_tl + i * data_stride;
      // Save old pixels, then replace with data from stripe_boundary_above
      memcpy(cm->gdf_info.glbs->gdf_save_above[i + GDF_TEST_EXTRA_VER_BORDER],
             dst - GDF_TEST_EXTRA_HOR_BORDER,
             line_size +
                 4 * GDF_TEST_EXTRA_HOR_BORDER);  // (sizeof(int16_t) * width +
                                                  // sizeof(int16_t) * 2 *
                                                  // GDF_TEST_EXTRA_HOR_BORDER
      memcpy(dst, buf, line_size);
      if (cm->cur_frame->buf.bit_depth > GDF_TEST_INP_PREC) {
        const unsigned int diff_bit_depth =
            cm->cur_frame->buf.bit_depth - GDF_TEST_INP_PREC;
        uint16_t *cur_line = dst;
        for (int j = 0; j < rec_width; j++) {
          cur_line[j] >>= diff_bit_depth;
        }
      }
      gdf_extend_frame_highbd(dst, rec_width, 1, data_stride,
                              GDF_TEST_EXTRA_HOR_BORDER, 0);
    }
  }
  if (copy_below) {
    uint16_t *data_bl = cm->gdf_info.inp_ptr + i_max * data_stride;
    for (int i = 0; i < GDF_TEST_EXTRA_VER_BORDER; ++i) {
      const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
      const int buf_off = buf_x0_off + buf_row * buf_stride;
      const uint16_t *src = rsb->stripe_boundary_below + buf_off;
      uint16_t *dst = data_bl + i * data_stride;
      // Save old pixels, then replace with data from stripe_boundary_below
      memcpy(cm->gdf_info.glbs->gdf_save_below[i],
             dst - GDF_TEST_EXTRA_HOR_BORDER,
             line_size + 4 * GDF_TEST_EXTRA_HOR_BORDER);
      memcpy(dst, src, line_size);
      if (cm->cur_frame->buf.bit_depth > GDF_TEST_INP_PREC) {
        const unsigned int diff_bit_depth =
            cm->cur_frame->buf.bit_depth - GDF_TEST_INP_PREC;
        uint16_t *cur_line = dst;
        for (int j = 0; j < rec_width; j++) {
          cur_line[j] >>= diff_bit_depth;
        }
      }
      gdf_extend_frame_highbd(dst, rec_width, 1, data_stride,
                              GDF_TEST_EXTRA_HOR_BORDER, 0);
    }
  }
}

void gdf_unset_reference_lines(AV1_COMMON *cm, int i_min, int i_max) {
  const int rec_width = cm->cur_frame->buf.y_width;
  const int data_stride = cm->gdf_info.inp_stride;
  const int line_size = rec_width << 1;

  int copy_above = 1;
  int copy_below = 1;
  copy_above = copy_below = 1;
  if (copy_above) {
    uint16_t *data_tl = cm->gdf_info.inp_ptr + i_min * data_stride;
    for (int i = -GDF_TEST_EXTRA_VER_BORDER; i < 0; ++i) {
      uint16_t *dst = data_tl + i * data_stride;
      memcpy(dst - GDF_TEST_EXTRA_HOR_BORDER,
             cm->gdf_info.glbs->gdf_save_above[i + GDF_TEST_EXTRA_VER_BORDER],
             line_size + 4 * GDF_TEST_EXTRA_HOR_BORDER);
    }
  }

  if (copy_below) {
    uint16_t *data_bl = cm->gdf_info.inp_ptr + i_max * data_stride;
    for (int i = 0; i < GDF_TEST_EXTRA_VER_BORDER; ++i) {
      uint16_t *dst = data_bl + i * data_stride;
      memcpy(dst - GDF_TEST_EXTRA_HOR_BORDER,
             cm->gdf_info.glbs->gdf_save_below[i],
             line_size + 4 * GDF_TEST_EXTRA_HOR_BORDER);
    }
  }
}

#else
void gdf_copy_guided_frame(AV1_COMMON *cm) {
  int top_buf = 3, bot_buf = 3;
  const int rec_height = cm->cur_frame->buf.y_height;
  const int rec_stride = cm->cur_frame->buf.y_stride;

  cm->gdf_info.inp_pad_ptr = (uint16_t *)aom_memalign(
      32, (top_buf + rec_height + bot_buf) * rec_stride * sizeof(uint16_t));
  for (int i = top_buf; i < top_buf + rec_height; i++) {
    for (int j = 0; j < rec_stride; j++) {
      cm->gdf_info.inp_pad_ptr[i * rec_stride + j] =
          cm->cur_frame->buf
              .buffers[AOM_PLANE_Y][(i - top_buf) * rec_stride + j];
    }
  }
  cm->gdf_info.inp_ptr = cm->gdf_info.inp_pad_ptr + top_buf * rec_stride;
}
#endif

void gdf_free_guided_frame(AV1_COMMON *cm) {
  aom_free(cm->gdf_info.inp_pad_ptr);
}

int gdf_get_block_idx(const AV1_COMMON *cm, int y_h, int y_w) {
  int blk_idx = -1;
  if ((y_h % cm->gdf_info.gdf_block_size == 0) &&
      (y_w % cm->gdf_info.gdf_block_size == 0)) {
    int blk_idx_h = y_h / cm->gdf_info.gdf_block_size;
    int blk_idx_w = y_w / cm->gdf_info.gdf_block_size;
    blk_idx = blk_idx_h * cm->gdf_info.gdf_block_num_w + blk_idx_w;
  }
  blk_idx = blk_idx < cm->gdf_info.gdf_block_num ? blk_idx : -1;
  return blk_idx;
}

static INLINE int get_ref_dst_max(const AV1_COMMON *const cm) {
  int ref_dst_max = 0;
  for (int i = 0; i < cm->ref_frames_info.num_future_refs; i++) {
    const int ref = cm->ref_frames_info.future_refs[i];
    if ((ref == 0 || ref == 1) && get_ref_frame_buf(cm, ref) != NULL) {
      ref_dst_max =
          AOMMAX(ref_dst_max, abs(cm->ref_frames_info.ref_frame_distance[ref]));
    }
  }
  for (int i = 0; i < cm->ref_frames_info.num_past_refs; i++) {
    const int ref = cm->ref_frames_info.past_refs[i];
    if ((ref == 0 || ref == 1) && get_ref_frame_buf(cm, ref) != NULL) {
      ref_dst_max =
          AOMMAX(ref_dst_max, abs(cm->ref_frames_info.ref_frame_distance[ref]));
    }
  }

  return ref_dst_max > 0 ? ref_dst_max : INT_MAX;
}

int gdf_get_ref_dst_idx(const AV1_COMMON *cm) {
  int ref_dst_idx = 0;
  if (frame_is_intra_only(cm)) return ref_dst_idx;

  int ref_dst_max = get_ref_dst_max(cm);
  if (ref_dst_max < 2)
    ref_dst_idx = 1;
  else if (ref_dst_max < 3)
    ref_dst_idx = 2;
  else if (ref_dst_max < 6)
    ref_dst_idx = 3;
  else if (ref_dst_max < 11)
    ref_dst_idx = 4;
  else
    ref_dst_idx = 5;
  return ref_dst_idx;
}

int gdf_get_qp_idx_base(const AV1_COMMON *cm) {
  const int is_intra = frame_is_intra_only(cm);
  const int bit_depth = cm->cur_frame->buf.bit_depth;
  int qp_base = is_intra ? 85 : 110;
  int qp_offset = 24 * (bit_depth - 8);
  int qp = cm->quant_params.base_qindex;
  int qp_idx_avg, qp_idx_base;
  if (qp < (qp_base + 12 + qp_offset))
    qp_idx_avg = 0;
  else if (qp < (qp_base + 37 + qp_offset))
    qp_idx_avg = 1;
  else if (qp < (qp_base + 62 + qp_offset))
    qp_idx_avg = 2;
  else if (qp < (qp_base + 87 + qp_offset))
    qp_idx_avg = 3;
  else if (qp < (qp_base + 112 + qp_offset))
    qp_idx_avg = 4;
  else
    qp_idx_avg = 5;
  qp_idx_base = CLIP(qp_idx_avg - (GDF_RDO_QP_NUM >> 1), 0,
                     GDF_TRAIN_QP_NUM - GDF_RDO_QP_NUM);
  return qp_idx_base;
}

void gdf_filter_frame(AV1_COMMON *cm) {
  uint16_t *const rec_pnt = cm->cur_frame->buf.buffers[AOM_PLANE_Y];
  const int rec_stride = cm->cur_frame->buf.y_stride;

  if (cm->bru.frame_inactive_flag) return;
  const int bit_depth = cm->cur_frame->buf.bit_depth;
  const int pxl_max = (1 << cm->cur_frame->buf.bit_depth) - 1;
  const int pxl_shift =
      GDF_TEST_INP_PREC - AOMMIN(bit_depth, GDF_TEST_INP_PREC);
  const int err_shift = GDF_RDO_SCALE_NUM_LOG2 + GDF_TEST_INP_PREC - bit_depth;
  int ref_dst_idx = gdf_get_ref_dst_idx(cm);
  int qp_idx_min = gdf_get_qp_idx_base(cm) + cm->gdf_info.gdf_pic_qp_idx;
  int qp_idx_max_plus_1 = qp_idx_min + 1;
  int scale_val = cm->gdf_info.gdf_pic_scale_idx + 1;

#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  const int num_tile_rows =
      cm->seq_params.disable_loopfilters_across_tiles ? cm->tiles.rows : 1;
  const int num_tile_cols =
      cm->seq_params.disable_loopfilters_across_tiles ? cm->tiles.cols : 1;
#else
  const int num_tile_rows = 1;
  const int num_tile_cols = 1;
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
  AV1PixelRect tile_rect = av1_whole_frame_rect(cm, 0);
  int blk_idx = 0;
  int tile_blk_stripe0 = 0;
  for (int tile_row = 0; tile_row < num_tile_rows; ++tile_row) {
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
    if (cm->seq_params.disable_loopfilters_across_tiles) {
      TileInfo tile_info;
      av1_tile_init(&tile_info, cm, tile_row, 0);
      tile_rect = av1_get_tile_rect(&tile_info, cm, 0);
    }
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
    const int tile_height = tile_rect.bottom - tile_rect.top;
    for (int y_pos = -GDF_TEST_STRIPE_OFF, blk_idx_h = 0; y_pos < tile_height;
         y_pos += cm->gdf_info.gdf_block_size, blk_idx_h++) {
      if (blk_idx_h == cm->gdf_info.gdf_vert_blks_per_tile[tile_row]) {
        blk_idx -= cm->gdf_info.gdf_block_num_w;
      }
      int blk_stripe = 0;
      for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) {
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
        if (cm->seq_params.disable_loopfilters_across_tiles) {
          TileInfo tile_info;
          av1_tile_init(&tile_info, cm, tile_row, tile_col);
          tile_rect = av1_get_tile_rect(&tile_info, cm, 0);
        }
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
        const int tile_width = tile_rect.right - tile_rect.left;
        for (int x_pos = 0; x_pos < tile_width;
             x_pos += cm->gdf_info.gdf_block_size) {
          blk_stripe = 0;
          for (int v_pos = y_pos; v_pos < y_pos + cm->gdf_info.gdf_block_size &&
                                  v_pos < tile_height;
               v_pos += cm->gdf_info.gdf_unit_size) {
            int i_min =
                AOMMAX(v_pos, GDF_TEST_FRAME_BOUNDARY_SIZE) + tile_rect.top;
            int i_max = AOMMIN(v_pos + cm->gdf_info.gdf_unit_size,
                               tile_height - GDF_TEST_FRAME_BOUNDARY_SIZE) +
                        tile_rect.top;
#if CONFIG_GDF_IMPROVEMENT && (GDF_TEST_VIRTUAL_BOUNDARY == 2)
            gdf_setup_reference_lines(cm, i_min, i_max,
                                      tile_blk_stripe0 + blk_stripe);
#endif
            for (int u_pos = x_pos;
                 u_pos < x_pos + cm->gdf_info.gdf_block_size &&
                 u_pos < tile_width;
                 u_pos += cm->gdf_info.gdf_unit_size) {
              int j_min =
                  AOMMAX(u_pos, GDF_TEST_FRAME_BOUNDARY_SIZE) + tile_rect.left;
              int j_max = AOMMIN(u_pos + cm->gdf_info.gdf_unit_size,
                                 tile_width - GDF_TEST_FRAME_BOUNDARY_SIZE) +
                          tile_rect.left;
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
              int tile_boundary_left = (j_min == tile_rect.left);
              int tile_boundary_right = (j_max == tile_rect.right);
              gdf_setup_processing_stripe_leftright_boundary(
                  &cm->gdf_info, i_min, i_max, j_min, j_max, tile_boundary_left,
                  tile_boundary_right);
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES

              int use_gdf_local = 1;
              // FU level skip
              if (cm->bru.enabled) {
                const int mbmi_idx = get_mi_grid_idx(
                    &cm->mi_params,
                    AOMMIN(i_max - 1, (i_min + GDF_TEST_STRIPE_OFF)) >>
                        MI_SIZE_LOG2,
                    j_min >> MI_SIZE_LOG2);
                use_gdf_local =
                    cm->mi_params.mi_grid_base[mbmi_idx]->local_gdf_mode;
              }
              use_gdf_local &=
                  gdf_block_adjust_and_validate(&i_min, &i_max, &j_min, &j_max);
              if ((cm->gdf_info.gdf_mode == 1 ||
                   cm->gdf_info.gdf_block_flags[blk_idx]) &&
                  use_gdf_local) {
                const int bru_blk_skip = !bru_is_sb_active(
                    cm, j_min >> MI_SIZE_LOG2,
                    AOMMIN(i_max - 1, (i_min + GDF_TEST_STRIPE_OFF)) >>
                        MI_SIZE_LOG2);
                if (cm->bru.enabled && bru_blk_skip) {
                  aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                                     "GDF on not active SB");
                }
                for (int qp_idx = qp_idx_min; qp_idx < qp_idx_max_plus_1;
                     qp_idx++) {
#if CONFIG_GDF_IMPROVEMENT
                  gdf_set_lap_and_cls_unit(
                      i_min, i_max, j_min, j_max, cm->gdf_info.gdf_stripe_size,
                      cm->gdf_info.inp_ptr + cm->gdf_info.inp_stride * i_min +
                          j_min,
                      cm->gdf_info.inp_stride, bit_depth, cm->gdf_info.lap_ptr,
                      cm->gdf_info.lap_stride, cm->gdf_info.cls_ptr,
                      cm->gdf_info.cls_stride);
                  gdf_inference_unit(
                      i_min, i_max, j_min, j_max, cm->gdf_info.gdf_stripe_size,
                      qp_idx,
                      cm->gdf_info.inp_ptr + cm->gdf_info.inp_stride * i_min +
                          j_min,
                      cm->gdf_info.inp_stride, cm->gdf_info.lap_ptr,
                      cm->gdf_info.lap_stride, cm->gdf_info.cls_ptr,
                      cm->gdf_info.cls_stride, cm->gdf_info.err_ptr,
                      cm->gdf_info.err_stride, pxl_shift, ref_dst_idx);
#else
                  gdf_set_lap_and_cls_unit(
                      i_min, i_max, j_min, j_max, cm->gdf_info.gdf_stripe_size,
                      cm->gdf_info.inp_ptr + rec_stride * i_min + j_min,
                      rec_stride, bit_depth, cm->gdf_info.lap_ptr,
                      cm->gdf_info.lap_stride, cm->gdf_info.cls_ptr,
                      cm->gdf_info.cls_stride);
                  gdf_inference_unit(
                      i_min, i_max, j_min, j_max, cm->gdf_info.gdf_stripe_size,
                      qp_idx, cm->gdf_info.inp_ptr + rec_stride * i_min + j_min,
                      rec_stride, cm->gdf_info.lap_ptr, cm->gdf_info.lap_stride,
                      cm->gdf_info.cls_ptr, cm->gdf_info.cls_stride,
                      cm->gdf_info.err_ptr, cm->gdf_info.err_stride, pxl_shift,
                      ref_dst_idx);
#endif
#if CONFIG_DISABLE_LOOP_FILTERS_LOSSLESS
                  // If there is at-least 1 segment is lossless in a frame, we
                  // have to do 4x4 processing, because minimum lossless block
                  // can be 4x4 size. Although, regardless the value of
                  // cm->features.has_lossless_segment, we can always do 4x4
                  // processing, however, for software optimization purpose we
                  // have used  full block processing for whole lossy frame.
                  if (cm->features.has_lossless_segment) {
                    // 4x4 block level processing
                    int min_b_size = 1 << MI_SIZE_LOG2;
                    for (int i_pos_4x4 = i_min; i_pos_4x4 < i_max;
                         i_pos_4x4 += min_b_size) {
                      for (int j_pos_4x4 = j_min; j_pos_4x4 < j_max;
                           j_pos_4x4 += min_b_size) {
                        // CHECK_LOSSLESS(j_pos_4x4 % 4, " j_pos_4x4 is not
                        // multiple of 4"); CHECK_LOSSLESS(i_pos_4x4 % 4, "
                        // i_pos_4x4 is not multiple of 4");

                        const int mi_idx = get_mi_grid_idx(
                            &cm->mi_params, i_pos_4x4 >> MI_SIZE_LOG2,
                            j_pos_4x4 >> MI_SIZE_LOG2);
                        const int is_lossless =
                            cm->features
                                .lossless_segment[cm->mi_params
                                                      .mi_grid_base[mi_idx]
                                                      ->segment_id];
                        if (!is_lossless) {
                          int height_4x4 =
                              AOMMIN(min_b_size, i_max - i_pos_4x4);
                          int width_4x4 = AOMMIN(min_b_size, j_max - j_pos_4x4);
                          uint16_t *rec_pnt_4x4 =
                              rec_pnt + i_pos_4x4 * rec_stride + j_pos_4x4;
                          int16_t *errPnt =
                              cm->gdf_info.err_ptr +
                              (i_pos_4x4 - i_min) * cm->gdf_info.err_stride +
                              (j_pos_4x4 - j_min);
                          gdf_compensation_unit_c(
                              rec_pnt_4x4, rec_stride, errPnt,
                              cm->gdf_info.err_stride, err_shift, scale_val,
                              pxl_max, height_4x4, width_4x4);
                        }
                      }
                    }
                  } else {
#endif  // CONFIG_DISABLE_LOOP_FILTERS_LOSSLESS
                    gdf_compensation_unit(rec_pnt + i_min * rec_stride + j_min,
                                          rec_stride, cm->gdf_info.err_ptr,
                                          cm->gdf_info.err_stride, err_shift,
                                          scale_val, pxl_max, i_max - i_min,
                                          j_max - j_min);
#if CONFIG_DISABLE_LOOP_FILTERS_LOSSLESS
                  }
#endif  // CONFIG_DISABLE_LOOP_FILTERS_LOSSLESS
                }
              }
#if CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
              gdf_restore_processing_stripe_leftright_boundary(
                  &cm->gdf_info, i_min, i_max, j_min, j_max, tile_boundary_left,
                  tile_boundary_right);
#endif  // CONFIG_CONTROL_LOOPFILTERS_ACROSS_TILES
            }  // u_pos
#if CONFIG_GDF_IMPROVEMENT && (GDF_TEST_VIRTUAL_BOUNDARY == 2)
            gdf_unset_reference_lines(cm, i_min, i_max);
#endif
            blk_stripe++;
          }  // v_pos
          blk_idx++;
        }  // x_pos
      }  // tile_col
      tile_blk_stripe0 += blk_stripe;
    }  // y_pos
  }  // tile_row
}

#endif  // AOM_COMMON_GDF_H_
