From 1a5cd79f518268203c4fff7c664b387e5720f1f4 Mon Sep 17 00:00:00 2001 From: Mohamed Khaled Mohamed Date: Tue, 30 Aug 2022 21:13:27 +0200 Subject: [PATCH] avfilter: add bilateral_cuda filter GSoC 2022 Signed-off-by: Mohamed Khaled Signed-off-by: Timo Rothenpieler --- Changelog | 1 + compat/cuda/cuda_runtime.h | 1 + configure | 2 + doc/filters.texi | 39 +++ libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/version.h | 2 +- libavfilter/vf_bilateral_cuda.c | 479 +++++++++++++++++++++++++++++++ libavfilter/vf_bilateral_cuda.cu | 191 ++++++++++++ 9 files changed, 716 insertions(+), 1 deletion(-) create mode 100644 libavfilter/vf_bilateral_cuda.c create mode 100644 libavfilter/vf_bilateral_cuda.cu diff --git a/Changelog b/Changelog index c98f18f97a..70c12df8dc 100644 --- a/Changelog +++ b/Changelog @@ -37,6 +37,7 @@ version 5.1: - PHM image format support - remap_opencl filter - added chromakey_cuda filter +- added bilateral_cuda filter version 5.0: diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 5837c1ad37..082e4a8ba3 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } static inline __device__ double fabs(double a) { return __builtin_fabs(a); } static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } +static inline __device__ float powf(float a, float y) { return __builtin_powf(a,y); } static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } diff --git a/configure b/configure index 932ea5b553..b9e1e403b1 100755 --- a/configure +++ b/configure @@ -3151,6 +3151,8 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" chromakey_cuda_filter_deps="ffnvcodec" chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +bilateral_cuda_filter_deps="ffnvcodec" +bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm" hwupload_cuda_filter_deps="ffnvcodec" scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" diff --git a/doc/filters.texi b/doc/filters.texi index 1e7e98c350..dbc08163d8 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -7980,6 +7980,45 @@ Set planes to filter. Default is first only. This filter supports the all above options as @ref{commands}. +@section bilateral_cuda +CUDA accelerated bilateral filter, an edge preserving filter. +This filter is mathematically accurate thanks to the use of GPU acceleration. +For best output quality, use one to one chroma subsampling, i.e. yuv444p format. + +The filter accepts the following options: +@table @option +@item sigmaS +Set sigma of gaussian function to calculate spatial weight, also called sigma space. +Allowed range is 0.1 to 512. Default is 0.1. + +@item sigmaR +Set sigma of gaussian function to calculate color range weight, also called sigma color. +Allowed range is 0.1 to 512. Default is 0.1. + +@item window_size +Set window size of the bilateral function to determine the number of neighbours to loop on. +If the number entered is even, one will be added automatically. +Allowed range is 1 to 255. Default is 1. +@end table +@subsection Examples + +@itemize +@item +Apply the bilateral filter on a video. + +@example +./ffmpeg -v verbose \ +-hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \ +-init_hw_device cuda \ +-filter_complex \ +" \ +[0:v]scale_cuda=format=yuv444p[scaled_video]; +[scaled_video]bilateral_cuda=window_size=9:sigmaS=3.0:sigmaR=50.0" \ +-an -sn -c:v h264_nvenc -cq 20 out.mp4 +@end example + +@end itemize + @section bitplanenoise Show and measure bit plane noise. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index c5c44db440..841ec47141 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -194,6 +194,7 @@ OBJS-$(CONFIG_AVGBLUR_VULKAN_FILTER) += vf_avgblur_vulkan.o vulkan.o vul OBJS-$(CONFIG_BBOX_FILTER) += bbox.o vf_bbox.o OBJS-$(CONFIG_BENCH_FILTER) += f_bench.o OBJS-$(CONFIG_BILATERAL_FILTER) += vf_bilateral.o +OBJS-$(CONFIG_BILATERAL_CUDA_FILTER) += vf_bilateral_cuda.o vf_bilateral_cuda.ptx.o OBJS-$(CONFIG_BITPLANENOISE_FILTER) += vf_bitplanenoise.o OBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o OBJS-$(CONFIG_BLACKFRAME_FILTER) += vf_blackframe.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 0285889e70..79e8a16bbc 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -180,6 +180,7 @@ extern const AVFilter ff_vf_avgblur_vulkan; extern const AVFilter ff_vf_bbox; extern const AVFilter ff_vf_bench; extern const AVFilter ff_vf_bilateral; +extern const AVFilter ff_vf_bilateral_cuda; extern const AVFilter ff_vf_bitplanenoise; extern const AVFilter ff_vf_blackdetect; extern const AVFilter ff_vf_blackframe; diff --git a/libavfilter/version.h b/libavfilter/version.h index fb21fad979..fc0df70dee 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #include "version_major.h" -#define LIBAVFILTER_VERSION_MINOR 47 +#define LIBAVFILTER_VERSION_MINOR 48 #define LIBAVFILTER_VERSION_MICRO 100 diff --git a/libavfilter/vf_bilateral_cuda.c b/libavfilter/vf_bilateral_cuda.c new file mode 100644 index 0000000000..7a271f1ce0 --- /dev/null +++ b/libavfilter/vf_bilateral_cuda.c @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2022 Mohamed Khaled + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include + +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" + +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +#include "cuda/load_helper.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + + +typedef struct CUDABilateralContext { + const AVClass *class; + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + int in_planes, out_planes; + int in_plane_depths[4]; + int in_plane_channels[4]; + + int window_size; + float sigmaS; + float sigmaR; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUfunction cu_func_uv; + CUstream cu_stream; +} CUDABilateralContext; + +static av_cold int cudabilateral_init(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void cudabilateral_uninit(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext bilateral; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + } + + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(CUDABilateralContext *s, AVBufferRef *device_ctx, int width, int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext*)out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = width; + out_ctx->height = height; + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; +fail: + av_buffer_unref(&out_ref); + return ret; +} + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat in_format, enum AVPixelFormat out_format) +{ + CUDABilateralContext *s = ctx->priv; + int i, p, d; + + s->in_fmt = in_format; + s->out_fmt = out_format; + + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + s->in_planes = av_pix_fmt_count_planes(s->in_fmt); + s->out_planes = av_pix_fmt_count_planes(s->out_fmt); + + // find maximum step of each component of each plane + // For our subset of formats, this should accurately tell us how many channels CUDA needs + // i.e. 1 for Y plane, 2 for UV plane of NV12, 4 for single plane of RGB0 formats + + for (i = 0; i < s->in_desc->nb_components; i++) { + d = (s->in_desc->comp[i].depth + 7) / 8; + p = s->in_desc->comp[i].plane; + s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], s->in_desc->comp[i].step / d); + + s->in_plane_depths[p] = s->in_desc->comp[i].depth; + } +} + +static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height) +{ + CUDABilateralContext *s = ctx->priv; + AVHWFramesContext *in_frames_ctx; + int ret; + + /* check that we have a hw context */ + if (!ctx->inputs[0]->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; + + if (!format_is_supported(in_frames_ctx->sw_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported format: %s\n", av_get_pix_fmt_name(in_frames_ctx->sw_format)); + return AVERROR(ENOSYS); + } + + set_format_info(ctx, in_frames_ctx->sw_format, in_frames_ctx->sw_format); + + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); + if (ret < 0) + return ret; + + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!ctx->outputs[0]->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int cuda_bilateral_load_functions(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + extern const unsigned char ff_vf_bilateral_cuda_ptx_data[]; + extern const unsigned int ff_vf_bilateral_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_bilateral_cuda_ptx_data, ff_vf_bilateral_cuda_ptx_len); + if (ret < 0) + goto fail; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, "Process_uchar")); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar\n"); + goto fail; + } + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, "Process_uchar2")); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar2\n"); + goto fail; + } + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + + return ret; +} + +static av_cold int cuda_bilateral_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + CUDABilateralContext *s = ctx->priv; + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; + int ret; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = init_processing_chain(ctx, inlink->w, inlink->h); + if (ret < 0) + return ret; + + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + + // the window_size makes more sense when it is odd, so add 1 if even + s->window_size= (s->window_size%2) ? s->window_size : s->window_size+1; + + ret = cuda_bilateral_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_cuda_kernel(AVFilterContext *ctx, CUfunction func, + CUtexObject src_tex[3], AVFrame *out_frame, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) +{ + CUDABilateralContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + CUdeviceptr dst_devptr[3] = { + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], (CUdeviceptr)out_frame->data[2] + }; + + void *args_uchar[] = { + &src_tex[0], &src_tex[1], &src_tex[2], + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], + &width, &height, &pitch, + &width_uv, &height_uv, &pitch_uv, + &window_size, &sigmaS, &sigmaR + }; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + if (ret < 0) + return ret; + + return ret; +} + +static int cuda_bilateral_process_internal(AVFilterContext *ctx, + AVFrame *out, AVFrame *in) +{ + CUDABilateralContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; + int i, ret; + + CUtexObject tex[3] = { 0, 0, 0 }; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + for (i = 0; i < s->in_planes; i++) { + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = 0, // CU_TRSF_READ_AS_INTEGER to get raw ints instead of normalized floats from tex2D + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8, + .res.pitch2D.numChannels = s->in_plane_channels[i], + .res.pitch2D.pitchInBytes = in->linesize[i], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], + }; + + if (i == 1 || i == 2) { + res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); + res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); + } else { + res_desc.res.pitch2D.width = in->width; + res_desc.res.pitch2D.height = in->height; + } + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + } + + ret = call_cuda_kernel(ctx, (s->in_plane_channels[1] > 1) ? s->cu_func_uv : s->cu_func, + tex, out, + out->width, out->height, out->linesize[0], + AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), + AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), + out->linesize[1] >> ((s->in_plane_channels[1] > 1) ? 1 : 0), + s->window_size, s->sigmaS, s->sigmaR); + if (ret < 0) + goto exit; + +exit: + for (i = 0; i < s->in_planes; i++) + if (tex[i]) + CHECK_CU(cu->cuTexObjectDestroy(tex[i])); + + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + + return ret; +} + +static int cuda_bilateral_process(AVFilterContext *ctx, AVFrame *out, AVFrame *in) +{ + CUDABilateralContext *s = ctx->priv; + AVFrame *src = in; + int ret; + + ret = cuda_bilateral_process_internal(ctx, s->frame, src); + if (ret < 0) + return ret; + + src = s->frame; + ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); + if (ret < 0) + return ret; + + av_frame_move_ref(out, s->frame); + av_frame_move_ref(s->frame, s->tmp_frame); + + ret = av_frame_copy_props(out, in); + if (ret < 0) + return ret; + + return 0; +} + +static int cuda_bilateral_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *ctx = link->dst; + CUDABilateralContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext bilateral; + int ret = 0; + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = cuda_bilateral_process(ctx, out, in); + + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); +fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(CUDABilateralContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption options[] = { + { "sigmaS", "set spatial sigma", OFFSET(sigmaS), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.1, 512, FLAGS }, + { "sigmaR", "set range sigma", OFFSET(sigmaR), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.1, 512, FLAGS }, + { "window_size", "set neighbours window_size", OFFSET(window_size), AV_OPT_TYPE_INT, {.i64=1}, 1, 255, FLAGS }, + { NULL } +}; + +static const AVClass cuda_bilateral_class = { + .class_name = "cudabilateral", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad cuda_bilateral_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = cuda_bilateral_filter_frame, + }, +}; + +static const AVFilterPad cuda_bilateral_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = cuda_bilateral_config_props, + }, +}; + +const AVFilter ff_vf_bilateral_cuda = { + .name = "bilateral_cuda", + .description = NULL_IF_CONFIG_SMALL("GPU accelerated bilateral filter"), + + .init = cudabilateral_init, + .uninit = cudabilateral_uninit, + + .priv_size = sizeof(CUDABilateralContext), + .priv_class = &cuda_bilateral_class, + + FILTER_INPUTS(cuda_bilateral_inputs), + FILTER_OUTPUTS(cuda_bilateral_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_bilateral_cuda.cu b/libavfilter/vf_bilateral_cuda.cu new file mode 100644 index 0000000000..8aba3a079f --- /dev/null +++ b/libavfilter/vf_bilateral_cuda.cu @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2022 Mohamed Khaled + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "cuda/vector_helpers.cuh" + +extern "C" +{ + +/** + * @brief calculated squared norm difference between two 3-dimension vecors ||first_vector-second_vector||^2 + * used float4 for better performance + * + * @param first_yuv first color vector + * @param second_yuv second color vecotr + * @return answer of squared norm difference + */ +__device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv) +{ + float ans = 0; + ans += powf(first_yuv.x - second_yuv.x, 2); + ans += powf(first_yuv.y - second_yuv.y, 2); + ans += powf(first_yuv.z - second_yuv.z, 2); + return ans; +} + +/** + * @brief calculate w as stated in bilateral filter research paper + * + * @param first_yuv first color vector + * @param second_yuv second color vecotr + * @return the calculated w + */ +__device__ static inline float calculate_w(int x, int y, int r, int c, + float4 pixel_value, float4 neighbor_value, + float sigma_space, float sigma_color) +{ + float first_term, second_term; + first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space); + second_term = norm_squared(pixel_value, neighbor_value) / (2 * sigma_color * sigma_color); + return __expf(-first_term - second_term); +} + +/** + * @brief apply the bilateral filter on the pixel sent + * + * @param src_tex_Y Y channel of source image + * @param src_tex U channel of source image if yuv, or UV channels if format is nv12 + * @param src_tex_V V channel of source image + * @param dst_Y Y channel of destination image + * @param dst_U U channel of destination image if format is in yuv + * @param dst_V V channel of destination image if format is in yuv + * @param dst_UV UV channels of destination image if format is in nv12 + * @param width width of Y channel + * @param height height of Y channel + * @param width_uv width of UV channels + * @param height_uv height of UV channels + * @param pitch pitch of Y channel + * @param pitch_uv pitch of UV channels + * @param x x coordinate of pixel to be filtered + * @param y y coordinate of pixel to be filtered + * @param sigma_space sigma space parameter + * @param sigma_color sigma color parameter + * @param window_size window size parameter + * @return void + */ +__device__ static inline void apply_biltaeral( + cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex, cudaTextureObject_t src_tex_V, + uchar *dst_Y, uchar *dst_U, uchar *dst_V, uchar2 *dst_UV, + int width, int height, int width_uv, int height_uv, int pitch, int pitch_uv, + int x, int y, + float sigma_space, float sigma_color, int window_size) +{ + int start_r = x - window_size / 2; + int start_c = y - window_size / 2; + float4 neighbor_pixel = make_float4(0.f, 0.f, 0.f, 0.f); + float Wp = 0.f; + float4 new_pixel_value = make_float4(0.f, 0.f, 0.f, 0.f); + float w = 0.f; + + int channel_ratio = width / width_uv; // ratio between Y channel and UV channels + float4 currrent_pixel; + + if (!src_tex_V) { // format is in nv12 + float2 temp_uv = tex2D(src_tex, x/channel_ratio, y/channel_ratio) * 255.f; + currrent_pixel.x = tex2D(src_tex_Y, x, y) * 255.f; + currrent_pixel.y = temp_uv.x; + currrent_pixel.z = temp_uv.y; + currrent_pixel.w = 0.f; + } else { // format is fully planar + currrent_pixel = make_float4(tex2D(src_tex_Y, x, y) * 255.f, + tex2D(src_tex, x/channel_ratio, y/channel_ratio) * 255.f, + tex2D(src_tex_V, x/channel_ratio, y/channel_ratio) * 255.f, + 0.f); + } + + for (int i=0; i < window_size; i++) + { + for (int j=0; j < window_size; j++) + { + int r=start_r+i; + int c=start_c+j; + bool in_bounds=r>=0 && r=0 && c(src_tex, r/channel_ratio, c/channel_ratio); + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, + temp_uv.x * 255.f, + temp_uv.y * 255.f, 0.f); + } else { + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, + tex2D(src_tex, r/channel_ratio, c/channel_ratio) * 255.f, + tex2D(src_tex_V, r/channel_ratio, c/channel_ratio) * 255.f, 0.f); + } + w=calculate_w(x,y,r,c,currrent_pixel,neighbor_pixel,sigma_space,sigma_color); + Wp+=w; + new_pixel_value+= neighbor_pixel*w; + } + } + } + + new_pixel_value = new_pixel_value / Wp; + dst_Y[y*pitch + x] = new_pixel_value.x; + + if (!src_tex_V) { + dst_UV[(y/channel_ratio) * pitch_uv + (x/channel_ratio)] = make_uchar2(new_pixel_value.y, new_pixel_value.z); + } else { + dst_U[(y/channel_ratio) * pitch_uv + (x/channel_ratio)] = new_pixel_value.y; + dst_V[(y/channel_ratio) * pitch_uv + (x/channel_ratio)] = new_pixel_value.z; + } + + return; +} + + +__global__ void Process_uchar(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V, + uchar *dst_Y, uchar *dst_U, uchar *dst_V, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) +{ + + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (y >= height || x >= width) + return; + + apply_biltaeral(src_tex_Y, src_tex_U, src_tex_V, + dst_Y, dst_U, dst_V, (uchar2*)nullptr, + width, height, width_uv, height_uv, pitch, pitch_uv, + x, y, + sigmaS, sigmaR, window_size); +} + + +__global__ void Process_uchar2(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_UV, cudaTextureObject_t unused1, + uchar *dst_Y, uchar2 *dst_UV, uchar *unused2, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (y >= height || x >= width) + return; + + apply_biltaeral(src_tex_Y, src_tex_UV, (cudaTextureObject_t)nullptr, + dst_Y, (uchar*)nullptr, (uchar*)nullptr, dst_UV, + width, height, width_uv, height_uv, pitch, pitch_uv, + x, y, + sigmaS, sigmaR, window_size); +} + +}