Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-05-06 Thread Song, Ruiling


> -Original Message-
> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf
> Of Mark Thompson
> Sent: Monday, May 6, 2019 10:20 PM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> nlmeans_opencl filter
> 
> On 29/04/2019 03:06, Song, Ruiling wrote:>
> > In order to verify the patch, I also have more testing on the CPU OpenCL
> driver from Intel.
> > I make it run 100 times, and still not see any reported overflow. So I think
> we can say the filter is in good quality to be merged. Any different idea?
> 
> I've tried a lot more times on some additional platforms (Skylake-GT3, Mali-
> G52) and I can't reproduce it on anything else.  So, I think I agree that it 
> must
> be a driver issue and shouldn't block anything.
> 
> 
> On 12/04/2019 16:09, Ruiling Song wrote:
> > Signed-off-by: Ruiling Song 
> > ---
> >  configure   |   1 +
> >  doc/filters.texi|   4 +
> >  libavfilter/Makefile|   1 +
> >  libavfilter/allfilters.c|   1 +
> >  libavfilter/opencl/nlmeans.cl   | 115 +
> >  libavfilter/opencl_source.h |   1 +
> >  libavfilter/vf_nlmeans_opencl.c | 442
> 
> >  7 files changed, 565 insertions(+)
> >  create mode 100644 libavfilter/opencl/nlmeans.cl
> >  create mode 100644 libavfilter/vf_nlmeans_opencl.c
> >
> > ...
> > +
> > +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> > + cl_int width, cl_int height, cl_int p, cl_int r)
> > +{
> > +NLMeansOpenCLContext *ctx = avctx->priv;
> > +const float zero = 0.0f;
> > +const size_t worksize1[] = {height};
> > +const size_t worksize2[] = {width};
> > +const size_t worksize3[2] = {width, height};
> > +int dx, dy, err = 0, weight_buf_size;
> > +cl_int cle;
> > +int nb_pixel, *tmp, idx = 0;
> > +cl_int *dxdy;
> > +
> > +weight_buf_size = width * height * sizeof(float);
> > +cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight,
> > +  &zero, sizeof(float), 0, weight_buf_size,
> > +  0, NULL, NULL);
> > +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
> > + cle);
> > +cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum,
> > +  &zero, sizeof(float), 0, weight_buf_size,
> > +  0, NULL, NULL);
> > +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
> > + cle);
> > +
> > +nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
> > +dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int));
> > +tmp = av_malloc(nb_pixel * 2 * sizeof(int));
> > +
> > +if (!dxdy || !tmp)
> > +goto fail;
> > +
> > +for (dx = -r; dx <= r; dx++) {
> > +for (dy = -r; dy <= r; dy++) {
> > +if (dx || dy) {
> > +tmp[idx++] = dx;
> > +tmp[idx++] = dy;
> > +}
> > +}
> > +}
> > +// repack dx/dy seperately, as we want to do four pairs of dx/dy in a
> batch
> > +for (int i = 0; i < nb_pixel / 4; i++) {
> > +dxdy[i * 8] = tmp[i * 8]; // dx0
> > +dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
> > +dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
> > +dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
> > +dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
> > +dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
> > +dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
> > +dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
> > +}
> > +av_freep(&tmp);
> > +
> > +for (int i = 0; i < nb_pixel / 4; i++) {
> > +int *dx_cur = dxdy + 8 * i;
> > +int *dy_cur = dxdy + 8 * i + 4;
> 
> cl_int.
Fixed
> 
> > +
> > +// horizontal pass
> > +// integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx->integral_img);
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width);
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height);
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
> > +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-05-06 Thread Mark Thompson
On 29/04/2019 03:06, Song, Ruiling wrote:> 
> In order to verify the patch, I also have more testing on the CPU OpenCL 
> driver from Intel.
> I make it run 100 times, and still not see any reported overflow. So I think 
> we can say the filter is in good quality to be merged. Any different idea?

I've tried a lot more times on some additional platforms (Skylake-GT3, 
Mali-G52) and I can't reproduce it on anything else.  So, I think I agree that 
it must be a driver issue and shouldn't block anything.


On 12/04/2019 16:09, Ruiling Song wrote:
> Signed-off-by: Ruiling Song 
> ---
>  configure   |   1 +
>  doc/filters.texi|   4 +
>  libavfilter/Makefile|   1 +
>  libavfilter/allfilters.c|   1 +
>  libavfilter/opencl/nlmeans.cl   | 115 +
>  libavfilter/opencl_source.h |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 442 
>  7 files changed, 565 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c
> 
> ...
> +
> +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> + cl_int width, cl_int height, cl_int p, cl_int r)
> +{
> +NLMeansOpenCLContext *ctx = avctx->priv;
> +const float zero = 0.0f;
> +const size_t worksize1[] = {height};
> +const size_t worksize2[] = {width};
> +const size_t worksize3[2] = {width, height};
> +int dx, dy, err = 0, weight_buf_size;
> +cl_int cle;
> +int nb_pixel, *tmp, idx = 0;
> +cl_int *dxdy;
> +
> +weight_buf_size = width * height * sizeof(float);
> +cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight,
> +  &zero, sizeof(float), 0, weight_buf_size,
> +  0, NULL, NULL);
> +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
> + cle);
> +cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum,
> +  &zero, sizeof(float), 0, weight_buf_size,
> +  0, NULL, NULL);
> +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
> + cle);
> +
> +nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
> +dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int));
> +tmp = av_malloc(nb_pixel * 2 * sizeof(int));
> +
> +if (!dxdy || !tmp)
> +goto fail;
> +
> +for (dx = -r; dx <= r; dx++) {
> +for (dy = -r; dy <= r; dy++) {
> +if (dx || dy) {
> +tmp[idx++] = dx;
> +tmp[idx++] = dy;
> +}
> +}
> +}
> +// repack dx/dy seperately, as we want to do four pairs of dx/dy in a 
> batch
> +for (int i = 0; i < nb_pixel / 4; i++) {
> +dxdy[i * 8] = tmp[i * 8]; // dx0
> +dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
> +dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
> +dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
> +dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
> +dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
> +dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
> +dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
> +}
> +av_freep(&tmp);
> +
> +for (int i = 0; i < nb_pixel / 4; i++) {
> +int *dx_cur = dxdy + 8 * i;
> +int *dy_cur = dxdy + 8 * i + 4;

cl_int.

> +
> +// horizontal pass
> +// integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx->integral_img);
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width);
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height);
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
> +CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
> +cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, 
> 1,
> +   NULL, worksize1, NULL, 0, NULL, NULL);
> +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: 
> %d.\n",
> + cle);
> +// vertical pass
> +// integral(x, y) = sum(integral(x, v)) for v in [0, y]
> +CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
> +CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
> +CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
> +CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
> +cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
> + 1, NULL, worksize2, NULL, 0, NULL, 
> NULL);
> +CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: 
> %d.\n",
> + cle);
> +
> +// accumlate weights
> +CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &ctx->sum);
> +CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &ctx->weight);
> +

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-05-05 Thread Song, Ruiling
Will apply.

> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-28 Thread Song, Ruiling


> -Original Message-
> From: Song, Ruiling
> Sent: Tuesday, April 23, 2019 4:52 PM
> To: 'FFmpeg development discussions and patches' <ffmpeg-de...@ffmpeg.org>
> Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> nlmeans_opencl filter
> 
> 
> 
> > -Original Message-
> > From: Song, Ruiling
> > Sent: Sunday, April 21, 2019 8:18 PM
> > To: FFmpeg development discussions and patches <ffmpeg-de...@ffmpeg.org>
> > Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> > nlmeans_opencl filter
> >
> >
> >
> > > -Original Message-
> > > From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On
> > Behalf Of
> > > Mark Thompson
> > > Sent: Saturday, April 20, 2019 11:08 PM
> > > To: ffmpeg-devel@ffmpeg.org
> > > Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> > nlmeans_opencl
> > > filter
> > >
> > > On 17/04/2019 03:43, Song, Ruiling wrote:
> > > >> -Original Message-
> > > >> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On
> > Behalf
> > > Of
> > > >> Mark Thompson
> > > >> Sent: Wednesday, April 17, 2019 5:28 AM
> > > >> To: ffmpeg-devel@ffmpeg.org
> > > >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> > > nlmeans_opencl
> > > >> filter
> > > >>
> > > >> On 12/04/2019 16:09, Ruiling Song wrote:
> > > >>> Signed-off-by: Ruiling Song 
> > > >>
> > > >> I can't work out where the problem is, but there is something really
> > weirdly
> > > >> nondeterministic going on here.
> > > >>
> > > >> E.g.
> > > >>
> > > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-
> 120-
> > > mbps-
> > > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> > > >>
> >
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> > > >> frames:v 10 -f framemd5 -
> > > >> ...
> > > >> 0,  0,  0,1, 12441600, 
> > > >> 8b8805818076b23ae6f80ec2b5a349d4
> > > >> 0,  1,  1,1, 12441600, 
> > > >> 7a7fdaa083dc337cfb6af31b643f30a3
> > > >> 0,  2,  2,1, 12441600, 
> > > >> b10ef2a1e5125cc67e262e086f8040b5
> > > >> 0,  3,  3,1, 12441600, 
> > > >> c06b53ad90e0357e537df41b63d5b1dc
> > > >> 0,  4,  4,1, 12441600, 
> > > >> 5aa2da07703859a3dee080847dd17d46
> > > >> 0,  5,  5,1, 12441600, 
> > > >> 733364c6be6af825057e905a6092937d
> > > >> 0,  6,  6,1, 12441600, 
> > > >> 47edae2dec956a582b04babb745d26b0
> > > >> 0,  7,  7,1, 12441600, 
> > > >> 4e45fe8268df4298d06a17ab8e46c3e9
> > > >> 0,  8,  8,1, 12441600, 
> > > >> 960d722a3f8787c9191299a114c04174
> > > >> 0,  9,  9,1, 12441600, 
> > > >> e759c07ee4834a9cf94bfcb4128e7612
> > > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-
> 120-
> > > mbps-
> > > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> > > >>
> >
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> > > >> frames:v 10 -f framemd5 -
> > > >> 0,  0,  0,1, 12441600, 
> > > >> 8b8805818076b23ae6f80ec2b5a349d4
> > > >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image
> > overflow
> > > >> 2157538
> > > >> 0,  1,  1,1, 12441600, 
> > > >> bce72e10a9f1118940c5a8392ad78ec3
> > > >> 0,  2,  2,1, 12441600, 
> > > >> b10ef2a1e5125cc67e262e086f8040b5
> > > >> 0,  3,  3,1, 12441600, 
> > > >> c06b53ad90e0357e537df41b63d5b1dc
> > > >> 0,  4,  4,1, 12441600, 
> > > >> 5aa2da07703859a3dee080847dd17d46
> > > >> 0,  5,  5,1, 12441600, 
> > > >> 733364c6be6af825057e905a6092937d
> > > >> 0,  6,  6,1, 12441600, 

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-23 Thread Song, Ruiling


> -Original Message-
> From: Song, Ruiling
> Sent: Sunday, April 21, 2019 8:18 PM
> To: FFmpeg development discussions and patches <ffmpeg-de...@ffmpeg.org>
> Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> nlmeans_opencl filter
> 
> 
> 
> > -Original Message-
> > From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On
> Behalf Of
> > Mark Thompson
> > Sent: Saturday, April 20, 2019 11:08 PM
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> nlmeans_opencl
> > filter
> >
> > On 17/04/2019 03:43, Song, Ruiling wrote:
> > >> -Original Message-
> > >> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On
> Behalf
> > Of
> > >> Mark Thompson
> > >> Sent: Wednesday, April 17, 2019 5:28 AM
> > >> To: ffmpeg-devel@ffmpeg.org
> > >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> > nlmeans_opencl
> > >> filter
> > >>
> > >> On 12/04/2019 16:09, Ruiling Song wrote:
> > >>> Signed-off-by: Ruiling Song 
> > >>
> > >> I can't work out where the problem is, but there is something really
> weirdly
> > >> nondeterministic going on here.
> > >>
> > >> E.g.
> > >>
> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> > mbps-
> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> > >>
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> > >> frames:v 10 -f framemd5 -
> > >> ...
> > >> 0,  0,  0,1, 12441600, 
> > >> 8b8805818076b23ae6f80ec2b5a349d4
> > >> 0,  1,  1,1, 12441600, 
> > >> 7a7fdaa083dc337cfb6af31b643f30a3
> > >> 0,  2,  2,1, 12441600, 
> > >> b10ef2a1e5125cc67e262e086f8040b5
> > >> 0,  3,  3,1, 12441600, 
> > >> c06b53ad90e0357e537df41b63d5b1dc
> > >> 0,  4,  4,1, 12441600, 
> > >> 5aa2da07703859a3dee080847dd17d46
> > >> 0,  5,  5,1, 12441600, 
> > >> 733364c6be6af825057e905a6092937d
> > >> 0,  6,  6,1, 12441600, 
> > >> 47edae2dec956a582b04babb745d26b0
> > >> 0,  7,  7,1, 12441600, 
> > >> 4e45fe8268df4298d06a17ab8e46c3e9
> > >> 0,  8,  8,1, 12441600, 
> > >> 960d722a3f8787c9191299a114c04174
> > >> 0,  9,  9,1, 12441600, 
> > >> e759c07ee4834a9cf94bfcb4128e7612
> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> > mbps-
> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> > >>
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> > >> frames:v 10 -f framemd5 -
> > >> 0,  0,  0,1, 12441600, 
> > >> 8b8805818076b23ae6f80ec2b5a349d4
> > >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image
> overflow
> > >> 2157538
> > >> 0,  1,  1,1, 12441600, 
> > >> bce72e10a9f1118940c5a8392ad78ec3
> > >> 0,  2,  2,1, 12441600, 
> > >> b10ef2a1e5125cc67e262e086f8040b5
> > >> 0,  3,  3,1, 12441600, 
> > >> c06b53ad90e0357e537df41b63d5b1dc
> > >> 0,  4,  4,1, 12441600, 
> > >> 5aa2da07703859a3dee080847dd17d46
> > >> 0,  5,  5,1, 12441600, 
> > >> 733364c6be6af825057e905a6092937d
> > >> 0,  6,  6,1, 12441600, 
> > >> 47edae2dec956a582b04babb745d26b0
> > >> 0,  7,  7,1, 12441600, 
> > >> 4e45fe8268df4298d06a17ab8e46c3e9
> > >> 0,  8,  8,1, 12441600, 
> > >> 960d722a3f8787c9191299a114c04174
> > >> 0,  9,  9,1, 12441600, 
> > >> e759c07ee4834a9cf94bfcb4128e7612
> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> > mbps-
> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> > >>
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> > >> frames:v 10 -f framemd5 -
>

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-21 Thread Song, Ruiling


> -Original Message-
> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of
> Mark Thompson
> Sent: Saturday, April 20, 2019 11:08 PM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl
> filter
> 
> On 17/04/2019 03:43, Song, Ruiling wrote:
> >> -Original Message-
> >> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf
> Of
> >> Mark Thompson
> >> Sent: Wednesday, April 17, 2019 5:28 AM
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add
> nlmeans_opencl
> >> filter
> >>
> >> On 12/04/2019 16:09, Ruiling Song wrote:
> >>> Signed-off-by: Ruiling Song 
> >>
> >> I can't work out where the problem is, but there is something really 
> >> weirdly
> >> nondeterministic going on here.
> >>
> >> E.g.
> >>
> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> mbps-
> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> >> frames:v 10 -f framemd5 -
> >> ...
> >> 0,  0,  0,1, 12441600, 
> >> 8b8805818076b23ae6f80ec2b5a349d4
> >> 0,  1,  1,1, 12441600, 
> >> 7a7fdaa083dc337cfb6af31b643f30a3
> >> 0,  2,  2,1, 12441600, 
> >> b10ef2a1e5125cc67e262e086f8040b5
> >> 0,  3,  3,1, 12441600, 
> >> c06b53ad90e0357e537df41b63d5b1dc
> >> 0,  4,  4,1, 12441600, 
> >> 5aa2da07703859a3dee080847dd17d46
> >> 0,  5,  5,1, 12441600, 
> >> 733364c6be6af825057e905a6092937d
> >> 0,  6,  6,1, 12441600, 
> >> 47edae2dec956a582b04babb745d26b0
> >> 0,  7,  7,1, 12441600, 
> >> 4e45fe8268df4298d06a17ab8e46c3e9
> >> 0,  8,  8,1, 12441600, 
> >> 960d722a3f8787c9191299a114c04174
> >> 0,  9,  9,1, 12441600, 
> >> e759c07ee4834a9cf94bfcb4128e7612
> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> mbps-
> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> >> frames:v 10 -f framemd5 -
> >> 0,  0,  0,1, 12441600, 
> >> 8b8805818076b23ae6f80ec2b5a349d4
> >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow
> >> 2157538
> >> 0,  1,  1,1, 12441600, 
> >> bce72e10a9f1118940c5a8392ad78ec3
> >> 0,  2,  2,1, 12441600, 
> >> b10ef2a1e5125cc67e262e086f8040b5
> >> 0,  3,  3,1, 12441600, 
> >> c06b53ad90e0357e537df41b63d5b1dc
> >> 0,  4,  4,1, 12441600, 
> >> 5aa2da07703859a3dee080847dd17d46
> >> 0,  5,  5,1, 12441600, 
> >> 733364c6be6af825057e905a6092937d
> >> 0,  6,  6,1, 12441600, 
> >> 47edae2dec956a582b04babb745d26b0
> >> 0,  7,  7,1, 12441600, 
> >> 4e45fe8268df4298d06a17ab8e46c3e9
> >> 0,  8,  8,1, 12441600, 
> >> 960d722a3f8787c9191299a114c04174
> >> 0,  9,  9,1, 12441600, 
> >> e759c07ee4834a9cf94bfcb4128e7612
> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-
> mbps-
> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> >> frames:v 10 -f framemd5 -
> >> 0,  0,  0,1, 12441600, 
> >> 8b8805818076b23ae6f80ec2b5a349d4
> >> 0,  1,  1,1, 12441600, 
> >> 7a7fdaa083dc337cfb6af31b643f30a3
> >> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow
> >> 2098545
> >> 0,  2,  2,1, 12441600, 
> >> 68b390535adc5cfa0f8a7942c42a47ca
> >> 0,  3,  3,1, 12441600, 
> >> c06b53ad90e0357e537df41b63d5b1dc
> >> 0,  4,  4,1, 12441600, 
> >> 5aa2da07703859a3dee080847dd17d46
> >> 0,  5,  5,1, 12441600, 
> >> 733364c6be6af825057e905a6092937d
> >

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-20 Thread Mark Thompson
On 17/04/2019 03:43, Song, Ruiling wrote:
>> -Original Message-
>> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of
>> Mark Thompson
>> Sent: Wednesday, April 17, 2019 5:28 AM
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl
>> filter
>>
>> On 12/04/2019 16:09, Ruiling Song wrote:
>>> Signed-off-by: Ruiling Song 
>>
>> I can't work out where the problem is, but there is something really weirdly
>> nondeterministic going on here.
>>
>> E.g.
>>
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
>> ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> ...
>> 0,  0,  0,1, 12441600, 
>> 8b8805818076b23ae6f80ec2b5a349d4
>> 0,  1,  1,1, 12441600, 
>> 7a7fdaa083dc337cfb6af31b643f30a3
>> 0,  2,  2,1, 12441600, 
>> b10ef2a1e5125cc67e262e086f8040b5
>> 0,  3,  3,1, 12441600, 
>> c06b53ad90e0357e537df41b63d5b1dc
>> 0,  4,  4,1, 12441600, 
>> 5aa2da07703859a3dee080847dd17d46
>> 0,  5,  5,1, 12441600, 
>> 733364c6be6af825057e905a6092937d
>> 0,  6,  6,1, 12441600, 
>> 47edae2dec956a582b04babb745d26b0
>> 0,  7,  7,1, 12441600, 
>> 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,  8,  8,1, 12441600, 
>> 960d722a3f8787c9191299a114c04174
>> 0,  9,  9,1, 12441600, 
>> e759c07ee4834a9cf94bfcb4128e7612
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
>> ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> 0,  0,  0,1, 12441600, 
>> 8b8805818076b23ae6f80ec2b5a349d4
>> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow
>> 2157538
>> 0,  1,  1,1, 12441600, 
>> bce72e10a9f1118940c5a8392ad78ec3
>> 0,  2,  2,1, 12441600, 
>> b10ef2a1e5125cc67e262e086f8040b5
>> 0,  3,  3,1, 12441600, 
>> c06b53ad90e0357e537df41b63d5b1dc
>> 0,  4,  4,1, 12441600, 
>> 5aa2da07703859a3dee080847dd17d46
>> 0,  5,  5,1, 12441600, 
>> 733364c6be6af825057e905a6092937d
>> 0,  6,  6,1, 12441600, 
>> 47edae2dec956a582b04babb745d26b0
>> 0,  7,  7,1, 12441600, 
>> 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,  8,  8,1, 12441600, 
>> 960d722a3f8787c9191299a114c04174
>> 0,  9,  9,1, 12441600, 
>> e759c07ee4834a9cf94bfcb4128e7612
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
>> ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> 0,  0,  0,1, 12441600, 
>> 8b8805818076b23ae6f80ec2b5a349d4
>> 0,  1,  1,1, 12441600, 
>> 7a7fdaa083dc337cfb6af31b643f30a3
>> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow
>> 2098545
>> 0,  2,  2,1, 12441600, 
>> 68b390535adc5cfa0f8a7942c42a47ca
>> 0,  3,  3,1, 12441600, 
>> c06b53ad90e0357e537df41b63d5b1dc
>> 0,  4,  4,1, 12441600, 
>> 5aa2da07703859a3dee080847dd17d46
>> 0,  5,  5,1, 12441600, 
>> 733364c6be6af825057e905a6092937d
>> 0,  6,  6,1, 12441600, 
>> 47edae2dec956a582b04babb745d26b0
>> 0,  7,  7,1, 12441600, 
>> 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,  8,  8,1, 12441600, 
>> 960d722a3f8787c9191299a114c04174
>> 0,  9,  9,1, 12441600, 
>> e759c07ee4834a9cf94bfcb4128e7612
>>
>> Frame 1 gave an overflow on the second run, and gets a different answer, then
>> frame 2 in the same way on the third run?  I can't characterise when this
>> happens, it seems to be pretty random with low probability.
> 
> I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't 
> reproduce the issue.
>

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-16 Thread Song, Ruiling


> -Original Message-
> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of
> Mark Thompson
> Sent: Wednesday, April 17, 2019 5:28 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl
> filter
> 
> On 12/04/2019 16:09, Ruiling Song wrote:
> > Signed-off-by: Ruiling Song 
> 
> I can't work out where the problem is, but there is something really weirdly
> nondeterministic going on here.
> 
> E.g.
> 
> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> frames:v 10 -f framemd5 -
> ...
> 0,  0,  0,1, 12441600, 
> 8b8805818076b23ae6f80ec2b5a349d4
> 0,  1,  1,1, 12441600, 
> 7a7fdaa083dc337cfb6af31b643f30a3
> 0,  2,  2,1, 12441600, 
> b10ef2a1e5125cc67e262e086f8040b5
> 0,  3,  3,1, 12441600, 
> c06b53ad90e0357e537df41b63d5b1dc
> 0,  4,  4,1, 12441600, 
> 5aa2da07703859a3dee080847dd17d46
> 0,  5,  5,1, 12441600, 
> 733364c6be6af825057e905a6092937d
> 0,  6,  6,1, 12441600, 
> 47edae2dec956a582b04babb745d26b0
> 0,  7,  7,1, 12441600, 
> 4e45fe8268df4298d06a17ab8e46c3e9
> 0,  8,  8,1, 12441600, 
> 960d722a3f8787c9191299a114c04174
> 0,  9,  9,1, 12441600, 
> e759c07ee4834a9cf94bfcb4128e7612
> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> frames:v 10 -f framemd5 -
> 0,  0,  0,1, 12441600, 
> 8b8805818076b23ae6f80ec2b5a349d4
> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow
> 2157538
> 0,  1,  1,1, 12441600, 
> bce72e10a9f1118940c5a8392ad78ec3
> 0,  2,  2,1, 12441600, 
> b10ef2a1e5125cc67e262e086f8040b5
> 0,  3,  3,1, 12441600, 
> c06b53ad90e0357e537df41b63d5b1dc
> 0,  4,  4,1, 12441600, 
> 5aa2da07703859a3dee080847dd17d46
> 0,  5,  5,1, 12441600, 
> 733364c6be6af825057e905a6092937d
> 0,  6,  6,1, 12441600, 
> 47edae2dec956a582b04babb745d26b0
> 0,  7,  7,1, 12441600, 
> 4e45fe8268df4298d06a17ab8e46c3e9
> 0,  8,  8,1, 12441600, 
> 960d722a3f8787c9191299a114c04174
> 0,  9,  9,1, 12441600, 
> e759c07ee4834a9cf94bfcb4128e7612
> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
> frames:v 10 -f framemd5 -
> 0,  0,  0,1, 12441600, 
> 8b8805818076b23ae6f80ec2b5a349d4
> 0,  1,  1,1, 12441600, 
> 7a7fdaa083dc337cfb6af31b643f30a3
> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow
> 2098545
> 0,  2,  2,1, 12441600, 
> 68b390535adc5cfa0f8a7942c42a47ca
> 0,  3,  3,1, 12441600, 
> c06b53ad90e0357e537df41b63d5b1dc
> 0,  4,  4,1, 12441600, 
> 5aa2da07703859a3dee080847dd17d46
> 0,  5,  5,1, 12441600, 
> 733364c6be6af825057e905a6092937d
> 0,  6,  6,1, 12441600, 
> 47edae2dec956a582b04babb745d26b0
> 0,  7,  7,1, 12441600, 
> 4e45fe8268df4298d06a17ab8e46c3e9
> 0,  8,  8,1, 12441600, 
> 960d722a3f8787c9191299a114c04174
> 0,  9,  9,1, 12441600, 
> e759c07ee4834a9cf94bfcb4128e7612
> 
> Frame 1 gave an overflow on the second run, and gets a different answer, then
> frame 2 in the same way on the third run?  I can't characterise when this
> happens, it seems to be pretty random with low probability.

I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't 
reproduce the issue.
As I am encountering some network issue, I didn't get the video sample you 
provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-uhdtv/ ), 
I can try later to download the same video as you.
May be an OpenCL driver issue? I am not sure yet. So could you provide what 
hardware and opencl driver version you are using? So I can do some debugging if 
possible.

> 
> (Input here is a 4K file from <http://jell.yfish.us/>, but I don't think it 
>

Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-16 Thread Mark Thompson
On 12/04/2019 16:09, Ruiling Song wrote:
> Signed-off-by: Ruiling Song 

I can't work out where the problem is, but there is something really weirdly 
nondeterministic going on here.

E.g.

$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device 
opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p 
-frames:v 10 -f framemd5 -
...
0,  0,  0,1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
0,  1,  1,1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
0,  2,  2,1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
0,  3,  3,1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,  4,  4,1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,  5,  5,1, 12441600, 733364c6be6af825057e905a6092937d
0,  6,  6,1, 12441600, 47edae2dec956a582b04babb745d26b0
0,  7,  7,1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,  8,  8,1, 12441600, 960d722a3f8787c9191299a114c04174
0,  9,  9,1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device 
opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p 
-frames:v 10 -f framemd5 -
0,  0,  0,1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
[Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow 2157538
0,  1,  1,1, 12441600, bce72e10a9f1118940c5a8392ad78ec3
0,  2,  2,1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
0,  3,  3,1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,  4,  4,1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,  5,  5,1, 12441600, 733364c6be6af825057e905a6092937d
0,  6,  6,1, 12441600, 47edae2dec956a582b04babb745d26b0
0,  7,  7,1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,  8,  8,1, 12441600, 960d722a3f8787c9191299a114c04174
0,  9,  9,1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i 
~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device 
opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p 
-frames:v 10 -f framemd5 -
0,  0,  0,1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
0,  1,  1,1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
[Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow 2098545
0,  2,  2,1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca
0,  3,  3,1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,  4,  4,1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,  5,  5,1, 12441600, 733364c6be6af825057e905a6092937d
0,  6,  6,1, 12441600, 47edae2dec956a582b04babb745d26b0
0,  7,  7,1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,  8,  8,1, 12441600, 960d722a3f8787c9191299a114c04174
0,  9,  9,1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

Frame 1 gave an overflow on the second run, and gets a different answer, then 
frame 2 in the same way on the third run?  I can't characterise when this 
happens, it seems to be pretty random with low probability.

(Input here is a 4K file from <http://jell.yfish.us/>, but I don't think it 
matters - I saw it with others sometimes as well.)

>  configure   |   1 +
>  doc/filters.texi|   4 +
>  libavfilter/Makefile|   1 +
>  libavfilter/allfilters.c|   1 +
>  libavfilter/opencl/nlmeans.cl   | 115 +
>  libavfilter/opencl_source.h |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 442 
>  7 files changed, 565 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c

Code all looks fine, as far as I can tell.

Thanks,

- Mark
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl filter

2019-04-12 Thread Ruiling Song
Signed-off-by: Ruiling Song 
---
 configure   |   1 +
 doc/filters.texi|   4 +
 libavfilter/Makefile|   1 +
 libavfilter/allfilters.c|   1 +
 libavfilter/opencl/nlmeans.cl   | 115 +
 libavfilter/opencl_source.h |   1 +
 libavfilter/vf_nlmeans_opencl.c | 442 
 7 files changed, 565 insertions(+)
 create mode 100644 libavfilter/opencl/nlmeans.cl
 create mode 100644 libavfilter/vf_nlmeans_opencl.c

diff --git a/configure b/configure
index 0cdf0ffa8a..93ebfd6784 100755
--- a/configure
+++ b/configure
@@ -3461,6 +3461,7 @@ mpdecimate_filter_select="pixelutils"
 minterpolate_filter_select="scene_sad"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
+nlmeans_opencl_filter_deps="opencl"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/doc/filters.texi b/doc/filters.texi
index 867607d870..21c2c1a4b5 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -19030,6 +19030,10 @@ Apply erosion filter with threshold0 set to 30, 
threshold1 set 40, threshold2 se
 @end example
 @end itemize
 
+@section nlmeans_opencl
+
+Non-local Means denoise filter through OpenCL, this filter accepts same 
options as @ref{nlmeans}.
+
 @section overlay_opencl
 
 Overlay one video on top of another.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index fef6ec5c55..92039bfdcf 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -291,6 +291,7 @@ OBJS-$(CONFIG_MIX_FILTER)+= vf_mix.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
+OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o 
opencl/nlmeans.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f3c7..2a6390c92d 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -277,6 +277,7 @@ extern AVFilter ff_vf_mix;
 extern AVFilter ff_vf_mpdecimate;
 extern AVFilter ff_vf_negate;
 extern AVFilter ff_vf_nlmeans;
+extern AVFilter ff_vf_nlmeans_opencl;
 extern AVFilter ff_vf_nnedi;
 extern AVFilter ff_vf_noformat;
 extern AVFilter ff_vf_noise;
diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
new file mode 100644
index 0000000000..72bd681fd6
--- /dev/null
+++ b/libavfilter/opencl/nlmeans.cl
@@ -0,0 +1,115 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+   CLK_ADDRESS_CLAMP_TO_EDGE   |
+   CLK_FILTER_NEAREST);
+
+kernel void horiz_sum(__global uint4 *integral_img,
+  __read_only image2d_t src,
+  int width,
+  int height,
+  int4 dx,
+  int4 dy)
+{
+
+int y = get_global_id(0);
+int work_size = get_global_size(0);
+
+uint4 sum = (uint4)(0);
+float4 s2;
+for (int i = 0; i < width; i++) {
+float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
+s2.x = read_imagef(src, sampler, (int2)(i + dx.x, y + dy.x)).x;
+s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
+s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
+s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
+sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
+integral_img[y * width + i] = sum;
+}
+}
+
+kernel void vert_sum(__global uint4 *integral_img,
+ __global int *overflow,
+ int width,
+ int height)
+{
+int x = get_global_id(0);
+uint4 sum = 0;
+for (int i = 0; i < height; i++) {
+if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
+atomic_inc(overflow);
+integral_img[i * width + x] += sum;
+sum = integral_img[i * width + x];
+}
+}
+
+kernel void weight_accum(global float