diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 353efcf5f9..590c2d1bb0 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -182,4 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } static inline __device__ double fabs(double a) { return __builtin_fabs(a); } +static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } +static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); } + #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */ diff --git a/libavfilter/version.h b/libavfilter/version.h index 2db35f85af..44264e12cb 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #define LIBAVFILTER_VERSION_MAJOR 7 #define LIBAVFILTER_VERSION_MINOR 88 -#define LIBAVFILTER_VERSION_MICRO 101 +#define LIBAVFILTER_VERSION_MICRO 102 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index dfa638dbf7..f6401b35b0 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -59,6 +59,7 @@ enum { INTERP_ALGO_NEAREST, INTERP_ALGO_BILINEAR, INTERP_ALGO_BICUBIC, + INTERP_ALGO_LANCZOS, INTERP_ALGO_COUNT }; @@ -293,6 +294,12 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink) s->interp_use_linear = 0; s->interp_as_integer = 0; break; + case INTERP_ALGO_LANCZOS: + scaler_ptx = vf_scale_cuda_bicubic_ptx; + function_infix = "_Lanczos"; + s->interp_use_linear = 0; + s->interp_as_integer = 0; + break; default: av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n"); return AVERROR_BUG; @@ -601,6 +608,7 @@ static const AVOption options[] = { { "nearest", "nearest neighbour", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_NEAREST }, 0, 0, FLAGS, "interp_algo" }, { "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_BILINEAR }, 0, 0, FLAGS, "interp_algo" }, { "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_BICUBIC }, 0, 0, FLAGS, "interp_algo" }, + { "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_LANCZOS }, 0, 0, FLAGS, "interp_algo" }, { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 2, FLAGS, "force_oar" }, { "disable", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, 0, 0, FLAGS, "force_oar" }, diff --git a/libavfilter/vf_scale_cuda_bicubic.cu b/libavfilter/vf_scale_cuda_bicubic.cu index 8a27927e60..fe451ec54b 100644 --- a/libavfilter/vf_scale_cuda_bicubic.cu +++ b/libavfilter/vf_scale_cuda_bicubic.cu @@ -22,6 +22,30 @@ #include "cuda/vector_helpers.cuh" +typedef float4 (*coeffs_function_t)(float); + +__device__ inline float4 lanczos_coeffs(float x) +{ + const float pi = 3.141592654f; + + float4 res = make_float4( + pi * (x + 1), + pi * x, + pi * (x - 1), + pi * (x - 2)); + + res.x = res.x == 0.0f ? 1.0f : + __sinf(res.x) * __sinf(res.x / 2.0f) / (res.x * res.x / 2.0f); + res.y = res.y == 0.0f ? 1.0f : + __sinf(res.y) * __sinf(res.y / 2.0f) / (res.y * res.y / 2.0f); + res.z = res.z == 0.0f ? 1.0f : + __sinf(res.z) * __sinf(res.z / 2.0f) / (res.z * res.z / 2.0f); + res.w = res.w == 0.0f ? 1.0f : + __sinf(res.w) * __sinf(res.w / 2.0f) / (res.w * res.w / 2.0f); + + return res / (res.x + res.y + res.z + res.w); +} + __device__ inline float4 bicubic_coeffs(float x) { const float A = -0.75f; @@ -35,10 +59,8 @@ __device__ inline float4 bicubic_coeffs(float x) return res; } -__device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float *s) +__device__ inline void derived_fast_coeffs(float4 coeffs, float x, float *h0, float *h1, float *s) { - float4 coeffs = bicubic_coeffs(x); - float g0 = coeffs.x + coeffs.y; float g1 = coeffs.z + coeffs.w; @@ -48,7 +70,7 @@ __device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float } template -__device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3) +__device__ inline V apply_coeffs(float4 coeffs, V c0, V c1, V c2, V c3) { V res = c0 * coeffs.x; res += c1 * coeffs.y; @@ -59,7 +81,8 @@ __device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3) } template -__device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, +__device__ inline void Subsample_Bicubic(coeffs_function_t coeffs_function, + cudaTextureObject_t src_tex, T *dst, int dst_width, int dst_height, int dst_pitch, int src_width, int src_height, @@ -81,17 +104,17 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, float factor = bit_depth > 8 ? 0xFFFF : 0xFF; - float4 coeffsX = bicubic_coeffs(fx); - float4 coeffsY = bicubic_coeffs(fy); + float4 coeffsX = coeffs_function(fx); + float4 coeffsY = coeffs_function(fy); #define PIX(x, y) tex2D(src_tex, (x), (y)) dst[yo * dst_pitch + xo] = from_floatN( - bicubic_filter(coeffsY, - bicubic_filter(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), - bicubic_filter(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), - bicubic_filter(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), - bicubic_filter(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) + apply_coeffs(coeffsY, + apply_coeffs(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), + apply_coeffs(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), + apply_coeffs(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), + apply_coeffs(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) ) * factor ); @@ -101,7 +124,8 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex, /* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */ template -__device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex, +__device__ inline void Subsample_FastBicubic(coeffs_function_t coeffs_function, + cudaTextureObject_t src_tex, T *dst, int dst_width, int dst_height, int dst_pitch, int src_width, int src_height, @@ -123,10 +147,13 @@ __device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex, float factor = bit_depth > 8 ? 0xFFFF : 0xFF; + float4 coeffsX = coeffs_function(fx); + float4 coeffsY = coeffs_function(fy); + float h0x, h1x, sx; float h0y, h1y, sy; - bicubic_fast_coeffs(fx, &h0x, &h1x, &sx); - bicubic_fast_coeffs(fy, &h0y, &h1y, &sy); + derived_fast_coeffs(coeffsX, fx, &h0x, &h1x, &sx); + derived_fast_coeffs(coeffsY, fy, &h0y, &h1y, &sy); #define PIX(x, y) tex2D(src_tex, (x), (y)) @@ -157,7 +184,7 @@ extern "C" { int src_width, int src_height, \ int bit_depth) \ { \ - Subsample_Bicubic(src_tex, dst, \ + Subsample_Bicubic(&bicubic_coeffs, src_tex, dst, \ dst_width, dst_height, dst_pitch, \ src_width, src_height, \ bit_depth); \ @@ -171,4 +198,26 @@ BICUBIC_KERNEL(ushort) BICUBIC_KERNEL(ushort2) BICUBIC_KERNEL(ushort4) + +#define LANCZOS_KERNEL(T) \ + __global__ void Subsample_Lanczos_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Bicubic(&lanczos_coeffs, src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ + } + +LANCZOS_KERNEL(uchar) +LANCZOS_KERNEL(uchar2) +LANCZOS_KERNEL(uchar4) + +LANCZOS_KERNEL(ushort) +LANCZOS_KERNEL(ushort2) +LANCZOS_KERNEL(ushort4) + }