From 374f2ac370e1c109fde950613d539049c18fd974 Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Mon, 18 Oct 2021 18:03:16 +0200 Subject: [PATCH] avfilter/vf_fftdnoiz: add slice threading support --- libavfilter/vf_fftdnoiz.c | 236 +++++++++++++++++++++++++------------- 1 file changed, 154 insertions(+), 82 deletions(-) diff --git a/libavfilter/vf_fftdnoiz.c b/libavfilter/vf_fftdnoiz.c index e316d4c756..487e98b931 100644 --- a/libavfilter/vf_fftdnoiz.c +++ b/libavfilter/vf_fftdnoiz.c @@ -25,6 +25,8 @@ #include "internal.h" #include "libavcodec/avfft.h" +#define MAX_THREADS 32 + enum BufferTypes { CURRENT, PREV, @@ -40,11 +42,9 @@ typedef struct PlaneContext { float n; float *buffer[BSIZE]; - FFTComplex *hdata, *vdata; + FFTComplex *hdata[MAX_THREADS], *vdata[MAX_THREADS]; int data_linesize; int buffer_linesize; - - FFTContext *fft, *ifft; } PlaneContext; typedef struct FFTdnoizContext { @@ -62,8 +62,11 @@ typedef struct FFTdnoizContext { int depth; int nb_planes; + int nb_threads; PlaneContext planes[4]; + FFTContext *fft[MAX_THREADS], *ifft[MAX_THREADS]; + void (*import_row)(FFTComplex *dst, uint8_t *src, int rw); void (*export_row)(FFTComplex *src, uint8_t *dst, int rw, float scale, int depth); } FFTdnoizContext; @@ -90,23 +93,6 @@ static const AVOption fftdnoiz_options[] = { AVFILTER_DEFINE_CLASS(fftdnoiz); -static av_cold int init(AVFilterContext *ctx) -{ - FFTdnoizContext *s = ctx->priv; - int i; - - for (i = 0; i < 4; i++) { - PlaneContext *p = &s->planes[i]; - - p->fft = av_fft_init(s->block_bits, 0); - p->ifft = av_fft_init(s->block_bits, 1); - if (!p->fft || !p->ifft) - return AVERROR(ENOMEM); - } - - return 0; -} - static const enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY9, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, @@ -201,6 +187,14 @@ static int config_input(AVFilterLink *inlink) s->planes[0].planeheight = s->planes[3].planeheight = inlink->h; s->nb_planes = av_pix_fmt_count_planes(inlink->format); + s->nb_threads = FFMIN(ff_filter_get_nb_threads(ctx), MAX_THREADS); + + for (int i = 0; i < s->nb_threads; i++) { + s->fft[i] = av_fft_init(s->block_bits, 0); + s->ifft[i] = av_fft_init(s->block_bits, 1); + if (!s->fft[i] || !s->ifft[i]) + return AVERROR(ENOMEM); + } for (i = 0; i < s->nb_planes; i++) { PlaneContext *p = &s->planes[i]; @@ -230,10 +224,12 @@ static int config_input(AVFilterLink *inlink) return AVERROR(ENOMEM); } p->data_linesize = 2 * p->b * sizeof(float); - p->hdata = av_calloc(p->b, p->data_linesize); - p->vdata = av_calloc(p->b, p->data_linesize); - if (!p->hdata || !p->vdata) - return AVERROR(ENOMEM); + for (int j = 0; j < s->nb_threads; j++) { + p->hdata[j] = av_calloc(p->b, p->data_linesize); + p->vdata[j] = av_calloc(p->b, p->data_linesize); + if (!p->hdata[j] || !p->vdata[j]) + return AVERROR(ENOMEM); + } } return 0; @@ -241,7 +237,8 @@ static int config_input(AVFilterLink *inlink) static void import_plane(FFTdnoizContext *s, uint8_t *srcp, int src_linesize, - float *buffer, int buffer_linesize, int plane) + float *buffer, int buffer_linesize, int plane, + int jobnr, int nb_jobs) { PlaneContext *p = &s->planes[plane]; const int width = p->planewidth; @@ -253,12 +250,14 @@ static void import_plane(FFTdnoizContext *s, const int noy = p->noy; const int bpp = (s->depth + 7) / 8; const int data_linesize = p->data_linesize / sizeof(FFTComplex); - FFTComplex *hdata = p->hdata; - FFTComplex *vdata = p->vdata; + const int slice_start = (noy * jobnr) / nb_jobs; + const int slice_end = (noy * (jobnr+1)) / nb_jobs; + FFTComplex *hdata = p->hdata[jobnr]; + FFTComplex *vdata = p->vdata[jobnr]; int x, y, i, j; buffer_linesize /= sizeof(float); - for (y = 0; y < noy; y++) { + for (y = slice_start; y < slice_end; y++) { for (x = 0; x < nox; x++) { const int rh = FFMIN(block, height - y * size); const int rw = FFMIN(block, width - x * size); @@ -272,8 +271,8 @@ static void import_plane(FFTdnoizContext *s, dst[j].re = dst[block - j - 1].re; dst[j].im = 0; } - av_fft_permute(p->fft, dst); - av_fft_calc(p->fft, dst); + av_fft_permute(s->fft[jobnr], dst); + av_fft_calc(s->fft[jobnr], dst); src += src_linesize; dst += data_linesize; @@ -292,8 +291,8 @@ static void import_plane(FFTdnoizContext *s, for (i = 0; i < block; i++) { for (j = 0; j < block; j++) dst[j] = ssrc[j * data_linesize + i]; - av_fft_permute(p->fft, dst); - av_fft_calc(p->fft, dst); + av_fft_permute(s->fft[jobnr], dst); + av_fft_calc(s->fft[jobnr], dst); memcpy(bdst, dst, block * sizeof(FFTComplex)); dst += data_linesize; @@ -305,7 +304,8 @@ static void import_plane(FFTdnoizContext *s, static void export_plane(FFTdnoizContext *s, uint8_t *dstp, int dst_linesize, - float *buffer, int buffer_linesize, int plane) + float *buffer, int buffer_linesize, int plane, + int jobnr, int nb_jobs) { PlaneContext *p = &s->planes[plane]; const int depth = s->depth; @@ -320,12 +320,14 @@ static void export_plane(FFTdnoizContext *s, const int noy = p->noy; const int data_linesize = p->data_linesize / sizeof(FFTComplex); const float scale = 1.f / (block * block); - FFTComplex *hdata = p->hdata; - FFTComplex *vdata = p->vdata; + const int slice_start = (noy * jobnr) / nb_jobs; + const int slice_end = (noy * (jobnr+1)) / nb_jobs; + FFTComplex *hdata = p->hdata[jobnr]; + FFTComplex *vdata = p->vdata[jobnr]; int x, y, i, j; buffer_linesize /= sizeof(float); - for (y = 0; y < noy; y++) { + for (y = slice_start; y < slice_end; y++) { for (x = 0; x < nox; x++) { const int woff = x == 0 ? 0 : hoverlap; const int hoff = y == 0 ? 0 : hoverlap; @@ -338,8 +340,8 @@ static void export_plane(FFTdnoizContext *s, hdst = hdata; for (i = 0; i < block; i++) { memcpy(ddst, bsrc, block * sizeof(FFTComplex)); - av_fft_permute(p->ifft, ddst); - av_fft_calc(p->ifft, ddst); + av_fft_permute(s->ifft[jobnr], ddst); + av_fft_calc(s->ifft[jobnr], ddst); for (j = 0; j < block; j++) { hdst[j * data_linesize + i] = ddst[j]; } @@ -350,8 +352,8 @@ static void export_plane(FFTdnoizContext *s, hdst = hdata + hoff * data_linesize; for (i = 0; i < rh; i++) { - av_fft_permute(p->ifft, hdst); - av_fft_calc(p->ifft, hdst); + av_fft_permute(s->ifft[jobnr], hdst); + av_fft_calc(s->ifft[jobnr], hdst); s->export_row(hdst + woff, dst, rw, scale, depth); hdst += data_linesize; @@ -361,7 +363,8 @@ static void export_plane(FFTdnoizContext *s, } } -static void filter_plane3d2(FFTdnoizContext *s, int plane, float *pbuffer, float *nbuffer) +static void filter_plane3d2(FFTdnoizContext *s, int plane, float *pbuffer, float *nbuffer, + int jobnr, int nb_jobs) { PlaneContext *p = &s->planes[plane]; const int block = p->b; @@ -369,13 +372,15 @@ static void filter_plane3d2(FFTdnoizContext *s, int plane, float *pbuffer, float const int noy = p->noy; const int buffer_linesize = p->buffer_linesize / sizeof(float); const float sigma = s->sigma * s->sigma * block * block; + const int slice_start = (noy * jobnr) / nb_jobs; + const int slice_end = (noy * (jobnr+1)) / nb_jobs; const float limit = 1.f - s->amount; float *cbuffer = p->buffer[CURRENT]; const float cfactor = sqrtf(3.f) * 0.5f; const float scale = 1.f / 3.f; int y, x, i, j; - for (y = 0; y < noy; y++) { + for (y = slice_start; y < slice_end; y++) { for (x = 0; x < nox; x++) { float *cbuff = cbuffer + buffer_linesize * y * block + x * block * 2; float *pbuff = pbuffer + buffer_linesize * y * block + x * block * 2; @@ -421,7 +426,8 @@ static void filter_plane3d2(FFTdnoizContext *s, int plane, float *pbuffer, float } } -static void filter_plane3d1(FFTdnoizContext *s, int plane, float *pbuffer) +static void filter_plane3d1(FFTdnoizContext *s, int plane, float *pbuffer, + int jobnr, int nb_jobs) { PlaneContext *p = &s->planes[plane]; const int block = p->b; @@ -429,11 +435,13 @@ static void filter_plane3d1(FFTdnoizContext *s, int plane, float *pbuffer) const int noy = p->noy; const int buffer_linesize = p->buffer_linesize / sizeof(float); const float sigma = s->sigma * s->sigma * block * block; + const int slice_start = (noy * jobnr) / nb_jobs; + const int slice_end = (noy * (jobnr+1)) / nb_jobs; const float limit = 1.f - s->amount; float *cbuffer = p->buffer[CURRENT]; int y, x, i, j; - for (y = 0; y < noy; y++) { + for (y = slice_start; y < slice_end; y++) { for (x = 0; x < nox; x++) { float *cbuff = cbuffer + buffer_linesize * y * block + x * block * 2; float *pbuff = pbuffer + buffer_linesize * y * block + x * block * 2; @@ -473,7 +481,8 @@ static void filter_plane3d1(FFTdnoizContext *s, int plane, float *pbuffer) } } -static void filter_plane2d(FFTdnoizContext *s, int plane) +static void filter_plane2d(FFTdnoizContext *s, int plane, + int jobnr, int nb_jobs) { PlaneContext *p = &s->planes[plane]; const int block = p->b; @@ -482,15 +491,16 @@ static void filter_plane2d(FFTdnoizContext *s, int plane) const int buffer_linesize = p->buffer_linesize / 4; const float sigma = s->sigma * s->sigma * block * block; const float limit = 1.f - s->amount; + const int slice_start = (noy * jobnr) / nb_jobs; + const int slice_end = (noy * (jobnr+1)) / nb_jobs; float *buffer = p->buffer[CURRENT]; - int y, x, i, j; - for (y = 0; y < noy; y++) { - for (x = 0; x < nox; x++) { + for (int y = slice_start; y < slice_end; y++) { + for (int x = 0; x < nox; x++) { float *buff = buffer + buffer_linesize * y * block + x * block * 2; - for (i = 0; i < block; i++) { - for (j = 0; j < block; j++) { + for (int i = 0; i < block; i++) { + for (int j = 0; j < block; j++) { float factor, power, re, im; re = buff[j * 2 ]; @@ -507,6 +517,79 @@ static void filter_plane2d(FFTdnoizContext *s, int plane) } } +static int import_pass(AVFilterContext *ctx, void *arg, + int jobnr, int nb_jobs) +{ + FFTdnoizContext *s = ctx->priv; + + for (int plane = 0; plane < s->nb_planes; plane++) { + PlaneContext *p = &s->planes[plane]; + + if (!((1 << plane) & s->planesf) || ctx->is_disabled) + continue; + + if (s->next) { + import_plane(s, s->next->data[plane], s->next->linesize[plane], + p->buffer[NEXT], p->buffer_linesize, plane, + jobnr, nb_jobs); + } + + if (s->prev) { + import_plane(s, s->prev->data[plane], s->prev->linesize[plane], + p->buffer[PREV], p->buffer_linesize, plane, + jobnr, nb_jobs); + } + + import_plane(s, s->cur->data[plane], s->cur->linesize[plane], + p->buffer[CURRENT], p->buffer_linesize, plane, + jobnr, nb_jobs); + } + + return 0; +} + +static int filter_pass(AVFilterContext *ctx, void *arg, + int jobnr, int nb_jobs) +{ + FFTdnoizContext *s = ctx->priv; + + for (int plane = 0; plane < s->nb_planes; plane++) { + PlaneContext *p = &s->planes[plane]; + + if (!((1 << plane) & s->planesf) || ctx->is_disabled) + continue; + + if (s->next && s->prev) { + filter_plane3d2(s, plane, p->buffer[PREV], p->buffer[NEXT], jobnr, nb_jobs); + } else if (s->next) { + filter_plane3d1(s, plane, p->buffer[NEXT], jobnr, nb_jobs); + } else if (s->prev) { + filter_plane3d1(s, plane, p->buffer[PREV], jobnr, nb_jobs); + } else { + filter_plane2d(s, plane, jobnr, nb_jobs); + } + } + + return 0; +} + +static int export_pass(AVFilterContext *ctx, void *arg, + int jobnr, int nb_jobs) +{ + FFTdnoizContext *s = ctx->priv; + AVFrame *out = arg; + + for (int plane = 0; plane < s->nb_planes; plane++) { + PlaneContext *p = &s->planes[plane]; + + export_plane(s, out->data[plane], out->linesize[plane], + p->buffer[CURRENT], p->buffer_linesize, plane, + jobnr, nb_jobs); + } + + return 0; +} + static int filter_frame(AVFilterLink *inlink, AVFrame *in) { AVFilterContext *ctx = inlink->dst; @@ -559,6 +642,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) av_frame_copy_props(out, s->cur); } + ff_filter_execute(ctx, import_pass, NULL, NULL, + FFMIN(s->planes[0].noy, s->nb_threads)); + + ff_filter_execute(ctx, filter_pass, NULL, NULL, + FFMIN(s->planes[0].noy, s->nb_threads)); + + ff_filter_execute(ctx, export_pass, out, NULL, + FFMIN(s->planes[0].noy, s->nb_threads)); + for (plane = 0; plane < s->nb_planes; plane++) { PlaneContext *p = &s->planes[plane]; @@ -569,32 +661,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) p->planewidth, p->planeheight); continue; } - - if (s->next) { - import_plane(s, s->next->data[plane], s->next->linesize[plane], - p->buffer[NEXT], p->buffer_linesize, plane); - } - - if (s->prev) { - import_plane(s, s->prev->data[plane], s->prev->linesize[plane], - p->buffer[PREV], p->buffer_linesize, plane); - } - - import_plane(s, s->cur->data[plane], s->cur->linesize[plane], - p->buffer[CURRENT], p->buffer_linesize, plane); - - if (s->next && s->prev) { - filter_plane3d2(s, plane, p->buffer[PREV], p->buffer[NEXT]); - } else if (s->next) { - filter_plane3d1(s, plane, p->buffer[NEXT]); - } else if (s->prev) { - filter_plane3d1(s, plane, p->buffer[PREV]); - } else { - filter_plane2d(s, plane); - } - - export_plane(s, out->data[plane], out->linesize[plane], - p->buffer[CURRENT], p->buffer_linesize, plane); } if (s->nb_next == 0 && s->nb_prev == 0) { @@ -644,13 +710,19 @@ static av_cold void uninit(AVFilterContext *ctx) for (i = 0; i < 4; i++) { PlaneContext *p = &s->planes[i]; - av_freep(&p->hdata); - av_freep(&p->vdata); + for (int j = 0; j < s->nb_threads; j++) { + av_freep(&p->hdata[j]); + av_freep(&p->vdata[j]); + } + av_freep(&p->buffer[PREV]); av_freep(&p->buffer[CURRENT]); av_freep(&p->buffer[NEXT]); - av_fft_end(p->fft); - av_fft_end(p->ifft); + } + + for (i = 0; i < s->nb_threads; i++) { + av_fft_end(s->fft[i]); + av_fft_end(s->ifft[i]); } av_frame_free(&s->prev); @@ -679,11 +751,11 @@ const AVFilter ff_vf_fftdnoiz = { .name = "fftdnoiz", .description = NULL_IF_CONFIG_SMALL("Denoise frames using 3D FFT."), .priv_size = sizeof(FFTdnoizContext), - .init = init, .uninit = uninit, FILTER_INPUTS(fftdnoiz_inputs), FILTER_OUTPUTS(fftdnoiz_outputs), FILTER_PIXFMTS_ARRAY(pix_fmts), .priv_class = &fftdnoiz_class, - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL, + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | + AVFILTER_FLAG_SLICE_THREADS, };