| /* |
| * Copyright (c) 2023, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
#include <assert.h>

#include <cmath>
#include <cstdio>
#include <memory>
#include <vector>
| |
| #include "av1/common/resize.h" |
| #include "common/tools_common.h" |
| |
| #include "aom_dsp/aom_dsp_common.h" |
| #include "aom_dsp/flow_estimation/deepflow.h" |
| #include "aom_mem/aom_mem.h" |
| |
| #include "config/aom_dsp_rtcd.h" |
| |
| #include "av1/common/resize.h" |
| #include "common/tf_lite_includes.h" |
| |
| #define NUM_THREADS 8 |
| #define USE_XNNPACK 1 |
| #define USE_DYNAMIC_MODEL 0 |
| |
| namespace { |
| |
| #if USE_DYNAMIC_MODEL |
| #include "examples/deep_flow/pwcnet_l7_no_dense_s4_static_autoflow_ft_dynamic_16f.h" |
| #define MODEL_DATA pwcnet_l7_no_dense_s4_static_autoflow_ft_dynamic_16f |
| #else |
| #include "examples/deep_flow/pwcnet_l7_no_dense_s4_static_autoflow_ft_384x512_16f.h" |
| #define MODEL_DATA pwcnet_l7_no_dense_s4_static_autoflow_ft_384x512_16f |
| #endif // USE_DYNAMIC_MODEL |
| |
// Positions of the Resampler custom op's tensors within the node's input and
// output lists.
constexpr int kInputTensorSourceIndex = 0;
constexpr int kInputTensorWarpIndex = 1;
constexpr int kOutputTensorDestinationIndex = 0;

// Names used to look up the model's two image inputs and the output tensor
// that the flow field is extracted from.
const std::string kImage1_tensor_name("serving_default_image0:0");
const std::string kImage2_tensor_name("serving_default_image1:0");
const std::string kOutput_tensor_name("StatefulPartitionedCall:6");
| |
| // A Prepare function for the Resampler custom op. |
| // Checks dimensions and types of inputs and outputs of the node. |
| TfLiteStatus ResamplerPrepare(TfLiteContext *context, TfLiteNode *node) { |
| TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2); |
| TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1); |
| |
| const TfLiteTensor *source = |
| ::tflite::GetInput(context, node, kInputTensorSourceIndex); |
| TF_LITE_ENSURE(context, source != nullptr); |
| TF_LITE_ENSURE_EQ(context, ::tflite::NumDimensions(source), 4); |
| TF_LITE_ENSURE_EQ(context, source->type, kTfLiteFloat32); |
| |
| const TfLiteTensor *warp = |
| ::tflite::GetInput(context, node, kInputTensorWarpIndex); |
| TF_LITE_ENSURE(context, warp != nullptr); |
| TF_LITE_ENSURE_EQ(context, ::tflite::NumDimensions(warp), 4); |
| TF_LITE_ENSURE_EQ(context, warp->type, kTfLiteFloat32); |
| TF_LITE_ENSURE_EQ(context, warp->dims->data[3], 2); |
| |
| TfLiteTensor *output = |
| ::tflite::GetOutput(context, node, kOutputTensorDestinationIndex); |
| TF_LITE_ENSURE(context, output != nullptr); |
| TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); |
| TfLiteIntArray *output_size = TfLiteIntArrayCreate(4); |
| output_size->data[0] = source->dims->data[0]; |
| output_size->data[1] = source->dims->data[1]; |
| output_size->data[2] = source->dims->data[2]; |
| output_size->data[3] = source->dims->data[3]; |
| if (context->ResizeTensor(context, output, output_size) != kTfLiteOk) { |
| return kTfLiteError; |
| } |
| return kTfLiteOk; |
| } |
| |
// Bilinearly samples all channels of one pixel from an interleaved image.
//   dst:           receives d channel values.
//   src:           image with d interleaved channels per pixel, addressed as
//                  src[y * width * d + x * d + c].
//   map_x, map_y:  sampling position in pixels.
//   d:             number of channels.
//   width, height: dimensions of src in pixels.
// The four neighbouring pixel positions are clamped to the image, so sampling
// outside the image reads the nearest edge pixels.
//
// The sampling position is limited to [-1, size] before it is converted to an
// integer. Every position outside that range reads the same edge pixel as the
// range limit does, and limiting first keeps the float-to-int conversion
// well-defined for huge or NaN coordinates (NaN is treated like a position
// below the lower limit).
static void remap_pixel_bilinear(float *dst, const float *src, float map_x,
                                 float map_y, int d, int width, int height) {
  // Written as !(v >= lo) so that NaN also takes the lower-limit branch.
  if (!(map_x >= -1.f)) {
    map_x = -1.f;
  } else if (map_x > static_cast<float>(width)) {
    map_x = static_cast<float>(width);
  }
  if (!(map_y >= -1.f)) {
    map_y = -1.f;
  } else if (map_y > static_cast<float>(height)) {
    map_y = static_cast<float>(height);
  }

  int x0 = static_cast<int>(std::floor(map_x));
  int y0 = static_cast<int>(std::floor(map_y));
  int x1 = x0 + 1;
  int y1 = y0 + 1;

  // Fractional position inside the 2x2 neighbourhood, and the resulting
  // weights (mXY is the weight of the pixel at column xX, row yY).
  const float alphax = map_x - x0;
  const float alphay = map_y - y0;
  const float inv_alphax = static_cast<float>(1.0 - alphax);
  const float inv_alphay = static_cast<float>(1.0 - alphay);
  const float m00 = inv_alphax * inv_alphay;
  const float m01 = inv_alphax * alphay;
  const float m10 = alphax * inv_alphay;
  const float m11 = alphax * alphay;

  // Clamp the neighbour positions to the image (edge replication).
  const auto clamp_index = [](int v, int size) {
    return v < 0 ? 0 : (v >= size ? size - 1 : v);
  };
  x0 = clamp_index(x0, width);
  x1 = clamp_index(x1, width);
  y0 = clamp_index(y0, height);
  y1 = clamp_index(y1, height);

  const int row0 = y0 * width * d;
  const int row1 = y1 * width * d;
  const int col0 = x0 * d;
  const int col1 = x1 * d;
  for (int c = 0; c < d; ++c) {
    const float v00 = src[row0 + col0 + c];
    const float v10 = src[row0 + col1 + c];
    const float v01 = src[row1 + col0 + c];
    const float v11 = src[row1 + col1 + c];
    dst[c] = m00 * v00 + m10 * v10 + m01 * v01 + m11 * v11;
  }
}
| |
| // Eval function for the custom Resampler op. |
| TfLiteStatus ResamplerEval(TfLiteContext *context, TfLiteNode *node) { |
| const TfLiteTensor *src = |
| ::tflite::GetInput(context, node, kInputTensorSourceIndex); |
| const TfLiteTensor *warp = |
| ::tflite::GetInput(context, node, kInputTensorWarpIndex); |
| const TfLiteTensor *dst = |
| ::tflite::GetOutput(context, node, kOutputTensorDestinationIndex); |
| TF_LITE_ENSURE(context, src != nullptr); |
| TF_LITE_ENSURE(context, warp != nullptr); |
| TF_LITE_ENSURE(context, dst != nullptr); |
| float *src_data = reinterpret_cast<float *>(src->data.data); |
| float *warp_data = reinterpret_cast<float *>(warp->data.data); |
| float *dst_data = reinterpret_cast<float *>(dst->data.data); |
| |
| const int b = src->dims->data[0]; |
| const int h = src->dims->data[1]; |
| const int w = src->dims->data[2]; |
| const int d = src->dims->data[3]; |
| for (int batch = 0; batch < b; ++batch) { |
| const size_t data_offset = h * w * d * batch; |
| const int warp_offset = h * w * 2 * batch; |
| float *src_batch = src_data + data_offset; |
| float *dst_batch = dst_data + data_offset; |
| float *warp_batch = warp_data + warp_offset; |
| for (int i = 0; i < h * w; ++i) { |
| remap_pixel_bilinear(dst_batch, src_batch, warp_batch[0], warp_batch[1], |
| d, w, h); |
| dst_batch += d; |
| warp_batch += 2; |
| } |
| } |
| return kTfLiteOk; |
| } |
| |
| // Custom operation implementation. |
| TfLiteRegistration *ResamplerOp() { |
| static TfLiteRegistration reg = { |
| /*.init=*/ |
| [](TfLiteContext *, const char *, size_t) -> void * { |
| return new TfLitePaddingValues(); |
| }, |
| /*.free=*/ |
| [](TfLiteContext *, void *buffer) -> void { |
| delete reinterpret_cast<TfLitePaddingValues *>(buffer); |
| }, |
| /*.prepare=*/ResamplerPrepare, |
| /*.invoke=*/ResamplerEval, |
| /*.profiling_string=*/nullptr, |
| /*.builtin_code=*/0, |
| /*.custom_name=*/"Resampler.", 0 |
| }; |
| return ® |
| } |
| |
| static TfLiteDelegate *get_tflite_xnnpack_delegate(int num_threads) { |
| TfLiteXNNPackDelegateOptions xnnpack_options = |
| TfLiteXNNPackDelegateOptionsDefault(); |
| xnnpack_options.num_threads = AOMMAX(num_threads, 1); |
| return TfLiteXNNPackDelegateCreate(&xnnpack_options); |
| } |
| |
| static void get_input_tensor_indices( |
| std::unique_ptr<tflite::Interpreter> &interpreter, int *image1_tensor_index, |
| int *image2_tensor_index) { |
| *image1_tensor_index = -1; |
| *image2_tensor_index = -1; |
| for (int i = 0; i < (int)interpreter->inputs().size(); ++i) { |
| const auto name = interpreter->GetInputName(i); |
| if (name == kImage1_tensor_name) { |
| *image1_tensor_index = i; |
| } else if (name == kImage2_tensor_name) { |
| *image2_tensor_index = i; |
| } |
| } |
| } |
| |
| static void get_output_tensor_index( |
| std::unique_ptr<tflite::Interpreter> &interpreter, |
| int *output_tensor_index) { |
| *output_tensor_index = -1; |
| for (int i = 0; i < (int)interpreter->outputs().size(); ++i) { |
| const auto name = interpreter->GetOutputName(i); |
| if (name == kOutput_tensor_name) { |
| *output_tensor_index = i; |
| } |
| } |
| } |
| |
| // Builds and returns the TFlite interpreter. |
| static std::unique_ptr<tflite::Interpreter> get_tflite_interpreter( |
| int width, int height, int num_threads, TfLiteDelegate *xnnpack_delegate) { |
| (void)width; |
| (void)height; |
| const unsigned char *const model_tflite_data = MODEL_DATA; |
| |
| auto model = tflite::GetModel(model_tflite_data); |
| if (model == nullptr) return nullptr; |
| |
| tflite::ops::builtin::BuiltinOpResolver resolver; |
| resolver.AddCustom("Resampler", ResamplerOp()); |
| tflite::InterpreterBuilder builder(model, resolver); |
| std::unique_ptr<tflite::Interpreter> interpreter; |
| builder(&interpreter); |
| |
| tflite::ErrorReporter *reporter = tflite::DefaultErrorReporter(); |
| |
| if (xnnpack_delegate) { |
| if (interpreter->ModifyGraphWithDelegate(xnnpack_delegate) != kTfLiteOk) { |
| reporter->Report("Failed at modifying graph with XNNPack delegate"); |
| return nullptr; |
| } |
| } |
| #if USE_DYNAMIC_MODEL |
| // We only need to resize the input tensors. All other tensors (including |
| // output tensor) will be resized automatically. |
| // Dimension order: batch_size, height, width, num_channels. |
| // Note: height comes before width here! |
| const std::vector<int> in_dims = { 1, height, width, 3 }; |
| int image1_tensor_index = -1; |
| int image2_tensor_index = -1; |
| get_input_tensor_indices(interpreter, &image1_tensor_index, |
| &image2_tensor_index); |
| printf("input indices %d %d\n", image1_tensor_index, image2_tensor_index); |
| if (interpreter->ResizeInputTensor(interpreter->inputs()[image1_tensor_index], |
| in_dims) != kTfLiteOk) { |
| reporter->Report("Failed at input tensor resize"); |
| return nullptr; |
| } |
| if (interpreter->ResizeInputTensor(interpreter->inputs()[image2_tensor_index], |
| in_dims) != kTfLiteOk) { |
| reporter->Report("Failed at input tensor resize"); |
| return nullptr; |
| } |
| #endif // USE_DYNAMIC_MODEL |
| |
| if (interpreter->AllocateTensors() != kTfLiteOk) { |
| reporter->Report("Failed at tensor allocation"); |
| return nullptr; |
| } |
| interpreter->SetNumThreads(AOMMAX(num_threads, 1)); |
| return interpreter; |
| } |
| |
| } // namespace |
| |
| static int fill_input_tensor_highbd(const uint16_t *image, int width, |
| int height, int stride, |
| TfLiteTensor *tensor, int bit_depth) { |
| if (tensor->type != kTfLiteFloat32) { |
| printf("Expected float32 inputs.\n"); |
| return 0; |
| } |
| if (tensor->dims->size != 4) { |
| printf("Expected 4 dimensional inputs.\n"); |
| return 0; |
| } |
| if (tensor->dims->data[0] != 1) { |
| printf("Expected batch size 1.\n"); |
| return 0; |
| } |
| if (tensor->dims->data[3] != 3) { |
| printf("Expected RGB inputs.\n"); |
| return 0; |
| } |
| |
| const int h = tensor->dims->data[1]; |
| const int w = tensor->dims->data[2]; |
| uint16_t *image_resized = (uint16_t *)malloc(h * w * sizeof(*image_resized)); |
| av1_highbd_resize_plane(image, height, width, stride, image_resized, h, w, w, |
| bit_depth); |
| |
| float *data = static_cast<float *>(tensor->data.data); |
| const float kScale = (float)2.0 / ((1 << bit_depth) - 1); |
| const float kOffset = -1.f; |
| for (int y = 0; y < h; ++y) { |
| for (int x = 0; x < w; ++x) { |
| const float v = image_resized[y * w + x] * kScale + kOffset; |
| *data++ = v; |
| *data++ = v; |
| *data++ = v; |
| } |
| } |
| free(image_resized); |
| return 1; |
| } |
| |
| static int extract_output_flow(TfLiteTensor *tensor, double *flow_x, |
| double *flow_y, int flow_width, int flow_height, |
| int flow_stride) { |
| static const double output_scale = 20.f; |
| if (tensor->type != kTfLiteFloat32) { |
| fprintf(stderr, "Expected float32 output.\n"); |
| return 0; |
| } |
| if (tensor->dims->size != 4) { |
| fprintf(stderr, "Expected 4 dimensional output.\n"); |
| return 0; |
| } |
| if (tensor->dims->data[0] != 1) { |
| fprintf(stderr, "Expected batch size 1.\n"); |
| return 0; |
| } |
| if (tensor->dims->data[3] != 2) { |
| fprintf(stderr, "Expected 2-channel output.\n"); |
| return 0; |
| } |
| |
| const int h = tensor->dims->data[1]; |
| const int w = tensor->dims->data[2]; |
| const float *tensor_data = static_cast<float *>(tensor->data.data); |
| |
| double *flow_x_model_res = |
| (double *)malloc(h * w * sizeof(*flow_x_model_res)); |
| double *flow_y_model_res = |
| (double *)malloc(h * w * sizeof(*flow_y_model_res)); |
| for (int y = 0; y < h; ++y) { |
| for (int x = 0; x < w; ++x) { |
| flow_x_model_res[y * w + x] = (double)*tensor_data++; |
| flow_y_model_res[y * w + x] = (double)*tensor_data++; |
| } |
| } |
| printf("Computed flow at size %dx%d\n", w, h); |
| |
| av1_resize_plane_double(flow_x_model_res, h, w, w, flow_x, flow_height, |
| flow_width, flow_stride); |
| av1_resize_plane_double(flow_y_model_res, h, w, w, flow_y, flow_height, |
| flow_width, flow_stride); |
| const double x_scale = output_scale * static_cast<double>(flow_width) / w; |
| const double y_scale = output_scale * static_cast<double>(flow_height) / h; |
| for (int i = 0; i < flow_height; ++i) { |
| for (int j = 0; j < flow_width; ++j) { |
| flow_x[i * flow_stride + j] *= x_scale; |
| flow_y[i * flow_stride + j] *= y_scale; |
| } |
| } |
| printf("Resized flow to size %dx%d\n", flow_width, flow_height); |
| return 1; |
| } |
| |
| extern "C" FlowField *aom_compute_deepflow_field(YV12_BUFFER_CONFIG *src, |
| YV12_BUFFER_CONFIG *ref, |
| int bit_depth) { |
| // Precompute information we will need about each frame |
| const int src_width = src->y_crop_width; |
| const int src_height = src->y_crop_height; |
| const int src_stride = src->y_stride; |
| const int ref_width = ref->y_crop_width; |
| const int ref_height = ref->y_crop_height; |
| const int ref_stride = ref->y_stride; |
| assert(ref_width == src_width); |
| assert(ref_height == src_height); |
| |
| FlowField *flow = aom_alloc_flow_field(src_width, src_height); |
| if (!flow) return NULL; |
| |
| // Compute flow over a cropped region to ensure output field is center-aligned |
| // with blocks of size DOWNSAMPLE_FACTOR x DOWNSAMPLE_FACTOR. |
| const int use_width = flow->width << DOWNSAMPLE_SHIFT; |
| const int use_height = flow->height << DOWNSAMPLE_SHIFT; |
| |
| uint16_t *src_buf = src->y_buffer; |
| uint16_t *ref_buf = ref->y_buffer; |
| |
| static const int use_xnnpack = USE_XNNPACK; |
| TfLiteDelegate *xnnpack_delegate = |
| use_xnnpack ? get_tflite_xnnpack_delegate(NUM_THREADS) : nullptr; |
| std::unique_ptr<tflite::Interpreter> interpreter = get_tflite_interpreter( |
| use_width, use_height, NUM_THREADS, xnnpack_delegate); |
| |
| if (interpreter == nullptr) { |
| aom_free_flow_field(flow); |
| return NULL; |
| } |
| |
| int image1_tensor_index = -1; |
| int image2_tensor_index = -1; |
| get_input_tensor_indices(interpreter, &image1_tensor_index, |
| &image2_tensor_index); |
| int output_tensor_index = -1; |
| get_output_tensor_index(interpreter, &output_tensor_index); |
| |
| // Prepare input. |
| if (!fill_input_tensor_highbd(src_buf, use_width, use_height, src_stride, |
| interpreter->input_tensor(image1_tensor_index), |
| bit_depth)) { |
| fprintf(stderr, "Could not load image1 input tensor.\n"); |
| aom_free_flow_field(flow); |
| return NULL; |
| } |
| if (!fill_input_tensor_highbd(ref_buf, use_width, use_height, ref_stride, |
| interpreter->input_tensor(image2_tensor_index), |
| bit_depth)) { |
| fprintf(stderr, "Could not load image2 input tensor.\n"); |
| aom_free_flow_field(flow); |
| return NULL; |
| } |
| |
| // Invoke TFlite inference. |
| tflite::ErrorReporter *reporter = tflite::DefaultErrorReporter(); |
| auto status = interpreter->Invoke(); |
| if (status != kTfLiteOk) { |
| reporter->Report("Failed at interpreter invocation"); |
| aom_free_flow_field(flow); |
| return NULL; |
| } |
| |
| if (!extract_output_flow(interpreter->output_tensor(output_tensor_index), |
| flow->u, flow->v, flow->width, flow->height, |
| flow->stride)) { |
| fprintf(stderr, "Could not extract output flow tensor.\n"); |
| aom_free_flow_field(flow); |
| return NULL; |
| } |
| // IMPORTANT: release the interpreter before destroying the delegate. |
| interpreter.reset(); |
| if (xnnpack_delegate) TfLiteXNNPackDelegateDelete(xnnpack_delegate); |
| |
| return flow; |
| } |