[FFmpeg-devel] [PATCH] avfilter/WIP: add nlmeans filter

2016-09-18 Thread Clément Bœsch
Fixes Ticket #4910

TODO (before 1st pushed version):
- add chroma parameters (for patch size and research window)
- doc
- lavfi minor bump, Changelog
- GBRP

TODO++ (after 1st version):
- SIMD for compute_safe_ssd_integral_image
- SIMD for final weighted averaging
- smart parameters for block and research size according to sigma like
  suggested by ipol?
- temporal support
---
 libavfilter/Makefile |   3 +-
 libavfilter/allfilters.c |   1 +
 libavfilter/tests/integral.c |  89 
 libavfilter/vf_nlmeans.c | 527 +++
 4 files changed, 619 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/tests/integral.c
 create mode 100644 libavfilter/vf_nlmeans.c

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 5cd10fa..57a38d3 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -220,6 +220,7 @@ OBJS-$(CONFIG_METADATA_FILTER)   += f_metadata.o
 OBJS-$(CONFIG_MINTERPOLATE_FILTER)   += vf_minterpolate.o 
motion_estimation.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
+OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
@@ -354,7 +355,7 @@ SKIPHEADERS-$(CONFIG_OPENCL) += 
opencl_internal.h deshake_opencl
 OBJS-$(CONFIG_SHARED)+= log2_tab.o
 
 TOOLS = graph2dot
-TESTPROGS = drawutils filtfmts formats
+TESTPROGS = drawutils filtfmts formats integral
 
 TOOLS-$(CONFIG_LIBZMQ) += zmqsend
 
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 47d95f5..8f542fd 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -237,6 +237,7 @@ void avfilter_register_all(void)
 REGISTER_FILTER(MINTERPOLATE,   minterpolate,   vf);
 REGISTER_FILTER(MPDECIMATE, mpdecimate, vf);
 REGISTER_FILTER(NEGATE, negate, vf);
+REGISTER_FILTER(NLMEANS,nlmeans,vf);
 REGISTER_FILTER(NNEDI,  nnedi,  vf);
 REGISTER_FILTER(NOFORMAT,   noformat,   vf);
 REGISTER_FILTER(NOISE,  noise,  vf);
diff --git a/libavfilter/tests/integral.c b/libavfilter/tests/integral.c
new file mode 100644
index 000..7690254
--- /dev/null
+++ b/libavfilter/tests/integral.c
@@ -0,0 +1,89 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavfilter/vf_nlmeans.c"
+
+int main(void)
+{
+int ret = 0, xoff, yoff;
+
+// arbitrary test source of size 6x5 and linesize=8
+const int w = 6, h = 5, lz = 8;
+static const uint8_t src[] = {
+0xb0, 0x71, 0xfb, 0xd8, 0x01, 0xd9, /***/ 0x01, 0x02,
+0x51, 0x8e, 0x41, 0x0f, 0x84, 0x58, /***/ 0x03, 0x04,
+0xc7, 0x8d, 0x07, 0x70, 0x5c, 0x47, /***/ 0x05, 0x06,
+0x09, 0x4e, 0xfc, 0x74, 0x8f, 0x9a, /***/ 0x07, 0x08,
+0x60, 0x8e, 0x20, 0xaa, 0x95, 0x7d, /***/ 0x09, 0x0a,
+};
+
+const int r = 3;
+const int ii_w = w+r*2, ii_h = h+r*2;
+
+// align to 4 the linesize, "+1" is for the space of the left 0-column
+const int ii_lz_32 = ((ii_w + 1) + 3) & ~3;
+
+// "+1" is for the space of the top 0-line
+uint32_t *ii  = calloc(ii_h + 1, ii_lz_32 * sizeof(*ii));
+uint32_t *ii2 = calloc(ii_h + 1, ii_lz_32 * sizeof(*ii2));
+
+uint32_t *ii_start  = ii  + ii_lz_32 + 1; // skip top 0-line and left 
0-column
+uint32_t *ii_start2 = ii2 + ii_lz_32 + 1; // skip top 0-line and left 
0-column
+
+for (yoff = -r; yoff <= r; yoff++) {
+for (xoff = -r; xoff <= r; xoff++) {
+int x, y;
+
+printf("xoff=%d yoff=%d\n", xoff, yoff);
+
+compute_ssd_integral_image(ii_start, ii_lz_32,
+   src, lz, xoff, yoff, r, w, h);
+
+for (y = 0; y < ii_h; y++) {
+for (x = 0; x < ii_w; x++)
+printf(" %7x", ii_start[y*ii_lz_32 + x]);
+printf("\n");
+}
+printf("

Re: [FFmpeg-devel] [PATCH] avfilter/WIP: add nlmeans filter

2016-09-18 Thread Carl Eugen Hoyos
2016-09-18 13:36 GMT+02:00 Clément Bœsch :
> Fixes Ticket #4910
>
> TODO (before 1st pushed version):
> - add chroma parameters (for patch size and research window)
> - doc
> - lavfi minor bump, Changelog

> - GBRP

Is this really important for the first version?

Carl Eugen
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter/WIP: add nlmeans filter

2016-09-18 Thread Clément Bœsch
On Sun, Sep 18, 2016 at 06:49:03PM +0200, Carl Eugen Hoyos wrote:
> 2016-09-18 13:36 GMT+02:00 Clément Bœsch :
> > Fixes Ticket #4910
> >
> > TODO (before 1st pushed version):
> > - add chroma parameters (for patch size and research window)
> > - doc
> > - lavfi minor bump, Changelog
> 
> > - GBRP
> 
> Is this really important for the first version?
> 

It's a matter of adding just one line and checking if it works, which I
just did :)

-- 
Clément B.
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avfilter: add nlmeans filter

2016-09-20 Thread Clément Bœsch
Fixes Ticket #4910
---
I actually tried to implement the better defaults suggestion from ipol (see
@todo) but it wasn't convincing; probably because of different scales, so I
need to investigate.

Also, integral is still inplace in the filter for now as I didn't find a clean
way of testing it outside the filter without a long trip in dependency hell. I
think it can wait until the SIMD are implemented and the need to expose it
comes up.

I've made several changes from the initial WIP. The most important one is the
fix in the patch distance calculation, followed by the addition of chroma
parameters.

I believe the filter is ready for integration as a first version.

Two interesting examples: http://imgur.com/a/XXhJP
---
 Changelog|   1 +
 doc/filters.texi |  35 +++
 libavfilter/Makefile |   3 +-
 libavfilter/allfilters.c |   1 +
 libavfilter/tests/integral.c |  92 
 libavfilter/version.h|   2 +-
 libavfilter/vf_nlmeans.c | 548 +++
 7 files changed, 680 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/tests/integral.c
 create mode 100644 libavfilter/vf_nlmeans.c

diff --git a/Changelog b/Changelog
index 2d0a449..a5282b4 100644
--- a/Changelog
+++ b/Changelog
@@ -31,6 +31,7 @@ version :
 - MediaCodec HEVC decoding
 - TrueHD encoder
 - Meridian Lossless Packing (MLP) encoder
+- nlmeans filter (denoiser)
 
 
 version 3.1:
diff --git a/doc/filters.texi b/doc/filters.texi
index 070e57d..7e9ab60 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -9695,6 +9695,41 @@ Negate input video.
 It accepts an integer in input; if non-zero it negates the
 alpha component (if available). The default value in input is 0.
 
+@section nlmeans
+
+Denoise frames using Non-Local Means algorithm.
+
+Each pixel is adjusted by looking for other pixels with similar contexts. This
+context similarity is defined by their surrounding patch of size
+@option{p}x@option{p}. Patches are researched in an area of
+@option{r}x@option{r} surrouding the pixel.
+
+Note that the research area defines centers for patches, which means some
+patches will be made of pixels outside that research area.
+
+The filter accepts the following options.
+
+@table @option
+@item s
+Set denoising strength.
+
+@item p
+Set patch size.
+
+@item pc
+Same as @option{p} but for chroma planes.
+
+The default value is @var{0} and means automatic.
+
+@item r
+Set research size.
+
+@item rc
+Same as @option{r} but for chroma planes.
+
+The default value is @var{0} and means automatic.
+@end table
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 5cd10fa..57a38d3 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -220,6 +220,7 @@ OBJS-$(CONFIG_METADATA_FILTER)   += f_metadata.o
 OBJS-$(CONFIG_MINTERPOLATE_FILTER)   += vf_minterpolate.o 
motion_estimation.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
+OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
@@ -354,7 +355,7 @@ SKIPHEADERS-$(CONFIG_OPENCL) += 
opencl_internal.h deshake_opencl
 OBJS-$(CONFIG_SHARED)+= log2_tab.o
 
 TOOLS = graph2dot
-TESTPROGS = drawutils filtfmts formats
+TESTPROGS = drawutils filtfmts formats integral
 
 TOOLS-$(CONFIG_LIBZMQ) += zmqsend
 
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 47d95f5..8f542fd 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -237,6 +237,7 @@ void avfilter_register_all(void)
 REGISTER_FILTER(MINTERPOLATE,   minterpolate,   vf);
 REGISTER_FILTER(MPDECIMATE, mpdecimate, vf);
 REGISTER_FILTER(NEGATE, negate, vf);
+REGISTER_FILTER(NLMEANS,nlmeans,vf);
 REGISTER_FILTER(NNEDI,  nnedi,  vf);
 REGISTER_FILTER(NOFORMAT,   noformat,   vf);
 REGISTER_FILTER(NOISE,  noise,  vf);
diff --git a/libavfilter/tests/integral.c b/libavfilter/tests/integral.c
new file mode 100644
index 000..8a2286b
--- /dev/null
+++ b/libavfilter/tests/integral.c
@@ -0,0 +1,92 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

Re: [FFmpeg-devel] [PATCH] avfilter: add nlmeans filter

2016-09-21 Thread Benoit Fouet

Hi,


On 20/09/2016 21:52, Clément Bœsch wrote:

Fixes Ticket #4910
---
I actually tried to implement the better defaults suggestion from ipol (see
@todo) but it wasn't convincing; probably because of different scales, so I
need to investigate.

Also, integral is still inplace in the filter for now as I didn't find a clean
way of testing it outside the filter without a long trip in dependency hell. I
think it can wait until the SIMD are implemented and the need to expose it
comes up.

I've made several changes from the initial WIP. The most important one is the
fix in the patch distance calculation, followed by the addition of chroma
parameters.

I believe the filter is ready for integration as a first version.

Two interesting examples: http://imgur.com/a/XXhJP
---
  Changelog|   1 +
  doc/filters.texi |  35 +++
  libavfilter/Makefile |   3 +-
  libavfilter/allfilters.c |   1 +
  libavfilter/tests/integral.c |  92 
  libavfilter/version.h|   2 +-
  libavfilter/vf_nlmeans.c | 548 +++
  7 files changed, 680 insertions(+), 2 deletions(-)
  create mode 100644 libavfilter/tests/integral.c
  create mode 100644 libavfilter/vf_nlmeans.c

diff --git a/Changelog b/Changelog
index 2d0a449..a5282b4 100644
--- a/Changelog
+++ b/Changelog
@@ -31,6 +31,7 @@ version :
  - MediaCodec HEVC decoding
  - TrueHD encoder
  - Meridian Lossless Packing (MLP) encoder
+- nlmeans filter (denoiser)
  


The full name could be used here: Non-Local Means (nlmeans) denoising filter

  
  version 3.1:

diff --git a/doc/filters.texi b/doc/filters.texi
index 070e57d..7e9ab60 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -9695,6 +9695,41 @@ Negate input video.
  It accepts an integer in input; if non-zero it negates the
  alpha component (if available). The default value in input is 0.
  
+@section nlmeans

+
+Denoise frames using Non-Local Means algorithm.
+
+Each pixel is adjusted by looking for other pixels with similar contexts. This
+context similarity is defined by their surrounding patch of size


"is defined by comparing their surrounding patches" ?


+@option{p}x@option{p}. Patches are researched in an area of
+@option{r}x@option{r} surrouding the pixel.
+


surrounding, or even simply "around"
Also "research" sounds weird (I'd use "search"), but maybe wait for 
someone native to comment


[...]


diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
new file mode 100644
index 000..f923f80
--- /dev/null
+++ b/libavfilter/vf_nlmeans.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2016 Clément Bœsch 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @todo
+ * - SIMD for compute_safe_ssd_integral_image
+ * - SIMD for final weighted averaging
+ * - better automatic defaults? see "Parameters" @ 
http://www.ipol.im/pub/art/2011/bcm_nlm/
+ * - temporal support (probably doesn't need any displacement according to
+ *   "Denoising image sequences does not require motion estimation")
+ * - bayer support?
+ * - FATE test (probably needs visual threshold test mechanism due to the use 
of floats)
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+struct weighted_avg {
+double total_weight;
+double sum;
+};
+
+#define WEIGHT_LUT_NBITS 9
+#define WEIGHT_LUT_SIZE  (1<

nit: could you use _uv instead of _c (the latter has a taste of C vs ASM)?


+int research_size,   research_hsize;// research size and half size
+int research_size_c, research_hsize_c;  // research size and half size for 
chroma planes
+uint32_t *ii_orig;  // integral image
+uint32_t *ii;   // integral image starting after 
the 0-line and 0-column
+int ii_w, ii_h; // width and height of the 
integral image
+int ii_lz_32;   // linesize in 32-bit units of the 
integral image
+struct weighted_avg *wa;

Re: [FFmpeg-devel] [PATCH] avfilter: add nlmeans filter

2016-09-24 Thread Clément Bœsch
On Wed, Sep 21, 2016 at 02:39:55PM +0200, Benoit Fouet wrote:
[...]
> > diff --git a/Changelog b/Changelog
> > index 2d0a449..a5282b4 100644
> > --- a/Changelog
> > +++ b/Changelog
> > @@ -31,6 +31,7 @@ version :
> >   - MediaCodec HEVC decoding
> >   - TrueHD encoder
> >   - Meridian Lossless Packing (MLP) encoder
> > +- nlmeans filter (denoiser)
> 
> The full name could be used here: Non-Local Means (nlmeans) denoising filter
> 

changed.

> >   version 3.1:
> > diff --git a/doc/filters.texi b/doc/filters.texi
> > index 070e57d..7e9ab60 100644
> > --- a/doc/filters.texi
> > +++ b/doc/filters.texi
> > @@ -9695,6 +9695,41 @@ Negate input video.
> >   It accepts an integer in input; if non-zero it negates the
> >   alpha component (if available). The default value in input is 0.
> > +@section nlmeans
> > +
> > +Denoise frames using Non-Local Means algorithm.
> > +
> > +Each pixel is adjusted by looking for other pixels with similar contexts. 
> > This
> > +context similarity is defined by their surrounding patch of size
> 
> "is defined by comparing their surrounding patches" ?
> 

OK

> > +@option{p}x@option{p}. Patches are researched in an area of
> > +@option{r}x@option{r} surrouding the pixel.
> > +
> 
> surrounding, or even simply "around"
> Also "research" sounds weird (I'd use "search"), but maybe wait for someone
> native to comment
> 

Sounds good, changed.

[...]
> > +int patch_size,   patch_hsize;  // patch size and half size
> > +int patch_size_c, patch_hsize_c;// patch size and half size 
> > for chroma planes
> 
> nit: could you use _uv instead of _c (the latter has a taste of C vs ASM)?
> 

Yeah, renamed.

[...]
> > +static const AVOption nlmeans_options[] = {
> > +{ "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { 
> > .dbl = 1.0 }, 1.0, 30.0, FLAGS },
> > +{ "p",  "patch size",   OFFSET(patch_size),   
> > AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
> > +{ "pc", "patch size for chroma planes", OFFSET(patch_size_c), 
> > AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
> > +{ "r",  "research window",   OFFSET(research_size),   
> > AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
> 
> Nit: if there is a correlation between the default patch size and research
> window size, maybe add defines? Like:
> #define NLMEANS_DEFAULT_PATCH_SIZE (3*2+1)
> #define NLMEANS_DEFAULT_RESEARCH_WINDOW_SIZE (NLMEANS_DEFAULT_PATCH_SIZE +
> 1)

No real relationship here actually.

[...]
> > +static inline int get_ssd_patch(const uint32_t *ii, int ii_lz_32, int x, 
> > int y, int p)
> 
> Actually, this is not really about SSD value here. This function does not
> really care about what has been integrated, it just computes the patch value
> only by knowing the buffer is an integral image.
> get_patch_value (maybe too much generic)?
> Anyway... this is nitpicking, feel free to just ignore, I just felt I needed
> to explain a bit more why I wanted another name :-)
> 

Renamed to get_integral_patch_value()

[...]
> > + * The above line and left column of dst are always readable.
> > + *
> 
> The line above dst and the column to its left are always readable.
> 

changed

[...]
> > + * On the other hand, the above line and left column of dst are still 
> > always
> > + * readable.
> > + *
> 
> same
> 

also changed

[...]
> > +for (x = startx; x < startx + w; x++) {
> > +const int s1x = av_clip(x -  r, 0, sw - 1);
> > +const int s2x = av_clip(x - (r + offx), 0, sw - 1);
> > +const int s1y = av_clip(y -  r, 0, sh - 1);
> > +const int s2y = av_clip(y - (r + offy), 0, sh - 1);
> > +const uint8_t v1 = src[s1y*linesize + s1x];
> > +const uint8_t v2 = src[s2y*linesize + s2x];
> > +const int d = v1 - v2;
> > +acc += d * d;
> > +dst[y*dst_linesize_32 + x] = dst[(y-1)*dst_linesize_32 + x] + 
> > acc;
> > +}
> 
> I can understand this is done on smaller portions, but it would still be
> good to (at least) move the y-only parts out of the x loop.
> 

Yeah sure, moved s1y and s2y out of the x loop

[...]
> > +// allocate weighted average for every pixel
> > +s->wa_linesize = inlink->w;
> > +s->wa = av_malloc_array(s->wa_linesize, inlink->h * sizeof(*s->wa));
> > +if (!s->wa)
> > +return AVERROR(ENOMEM);
> 
> this leaks s->ii_orig
> 

No that's fine, uninit() is called when init() fails

> > +static av_cold void uninit(AVFilterContext *ctx)
> > +{
> > +NLMeansContext *s = ctx->priv;
> > +av_freep(&s->ii_orig);
> 
> s->wa also needs to be freed
> 

Ah I fixed that in another branch but forgot to backport. Fixed.

Patch applied, thanks for the review

-- 
Clément B.
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/9] lavfi/nlmeans: random code shuffling to help compiler

2018-05-06 Thread Clément Bœsch
This makes nlmeans_slice() slightly faster at least on GCC 7.3.
---
 libavfilter/vf_nlmeans.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index e4952e187e..d222d3913e 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -368,7 +368,6 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 int x, y;
 NLMeansContext *s = ctx->priv;
 const struct thread_data *td = arg;
-const uint8_t *src = td->src;
 const int src_linesize = td->src_linesize;
 const int process_h = td->endy - td->starty;
 const int slice_start = (process_h *  jobnr   ) / nb_jobs;
@@ -377,14 +376,15 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const int endy   = td->starty + slice_end;
 
 for (y = starty; y < endy; y++) {
+const uint8_t *src = td->src + y*src_linesize;
+struct weighted_avg *wa = s->wa + y*s->wa_linesize;
 for (x = td->startx; x < td->endx; x++) {
 const int patch_diff_sq = get_integral_patch_value(td->ii_start, 
s->ii_lz_32, x, y, td->p);
 if (patch_diff_sq < s->max_meaningful_diff) {
-struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
 const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
 const double weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
-wa->total_weight += weight;
-wa->sum += weight * src[y*src_linesize + x];
+wa[x].total_weight += weight;
+wa[x].sum += weight * src[x];
 }
 }
 }
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] Misc improvements in nlmeans filter

2018-05-06 Thread Clément Bœsch
The biggest change is the introduction of the dsp infrastructure such
that more SIMD can be added, in particular x86 version(s) of the
integral computation function. Only aarch64 was added so far (because
the ASM is easy), and I don't plan to work on other architectures for now.

The filter is still pretty slow, so I'm open to suggestions.

Regards,


___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/9] lavfi/nlmeans: add SIMD-friendly assumptions for compute_safe_ssd_integral_image

2018-05-06 Thread Clément Bœsch
SIMD code will not have to deal with padding itself. Overwriting in that
function may have been possible but would involve large overreading of the
sources. Instead, we simply make sure the width to process is always a
multiple of 16. Additionally, there must be some actual area to process
so the SIMD code can have its boundary checks after processing the first
pixels.
---
 libavfilter/vf_nlmeans.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index d222d3913e..21f981a605 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, int dst_linesize_32
 {
 int x, y;
 
+/* SIMD-friendly assumptions allowed here */
+av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
+
 for (y = 0; y < h; y++) {
 uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
 
@@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int 
ii_linesize_32,
 // to compare the 2 sources pixels
 const int startx_safe = FFMAX(s1x, s2x);
 const int starty_safe = FFMAX(s1y, s2y);
-const int endx_safe   = FFMIN(s1x + w, s2x + w);
+const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned
 const int endy_safe   = FFMIN(s1y + h, s2y + h);
 
+// deduce the safe area width and height
+const int safe_pw = (u_endx_safe - startx_safe) & ~0xf;
+const int safe_ph = endy_safe - starty_safe;
+
+// adjusted end x position of the safe area after width of the safe area 
gets aligned
+const int endx_safe = startx_safe + safe_pw;
+
 // top part where only one of s1 and s2 is still readable, or none at all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   0, 0,
@@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int 
ii_linesize_32,
   0, starty_safe,
   src, linesize,
   offx, offy, e, w, h,
-  startx_safe, endy_safe - starty_safe);
+  startx_safe, safe_ph);
 
 // main and safe part of the integral
 av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w);
 av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h);
 av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w);
 av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h);
-compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + 
startx_safe, ii_linesize_32,
-  src + (starty_safe - s1y) * linesize + 
(startx_safe - s1x), linesize,
-  src + (starty_safe - s2y) * linesize + 
(startx_safe - s2x), linesize,
-  endx_safe - startx_safe, endy_safe - 
starty_safe);
+if (safe_pw && safe_ph)
+dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + 
startx_safe, ii_linesize_32,
+ src + (starty_safe - s1y) * 
linesize + (startx_safe - s1x), linesize,
+ src + (starty_safe - s2y) * 
linesize + (startx_safe - s2x), linesize,
+ safe_pw, safe_ph);
 
 // right part of the integral
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   endx_safe, starty_safe,
   src, linesize,
   offx, offy, e, w, h,
-  ii_w - endx_safe, endy_safe - 
starty_safe);
+  ii_w - endx_safe, safe_ph);
 
 // bottom part where only one of s1 and s2 is still readable, or none at 
all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 4/9] lavfi/nlmeans: add AArch64 SIMD for compute_safe_ssd_integral_image

2018-05-06 Thread Clément Bœsch
ssd_integral_image_c: 49204.6
ssd_integral_image_neon: 28346.8
---
 libavfilter/aarch64/Makefile  |  3 ++
 libavfilter/aarch64/vf_nlmeans_init.c | 33 
 libavfilter/aarch64/vf_nlmeans_neon.S | 78 +++
 libavfilter/vf_nlmeans.c  | 18 +--
 libavfilter/vf_nlmeans.h  | 35 
 5 files changed, 164 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/aarch64/Makefile
 create mode 100644 libavfilter/aarch64/vf_nlmeans_init.c
 create mode 100644 libavfilter/aarch64/vf_nlmeans_neon.S
 create mode 100644 libavfilter/vf_nlmeans.h

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
new file mode 100644
index 00..b58daa3a3f
--- /dev/null
+++ b/libavfilter/aarch64/Makefile
@@ -0,0 +1,3 @@
+OBJS-$(CONFIG_NLMEANS_FILTER)+= aarch64/vf_nlmeans_init.o
+
+NEON-OBJS-$(CONFIG_NLMEANS_FILTER)   += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_nlmeans_init.c 
b/libavfilter/aarch64/vf_nlmeans_init.c
new file mode 100644
index 00..a1edefb144
--- /dev/null
+++ b/libavfilter/aarch64/vf_nlmeans_init.c
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/vf_nlmeans.h"
+
+void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+ const uint8_t *s1, ptrdiff_t 
linesize1,
+ const uint8_t *s2, ptrdiff_t 
linesize2,
+ int w, int h);
+
+av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp)
+{
+int cpu_flags = av_get_cpu_flags();
+
+if (have_neon(cpu_flags))
+dsp->compute_safe_ssd_integral_image = 
ff_compute_safe_ssd_integral_image_neon;
+}
diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
b/libavfilter/aarch64/vf_nlmeans_neon.S
new file mode 100644
index 00..4de573cf7f
--- /dev/null
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Clément Bœsch 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
+.macro acc_sum_store x, xb
+dup v24.4S, v24.4S[3]   // 
...X -> 
+ext v25.16B, v26.16B, \xb, #12  // 
ext(,ABCD,12)=0ABC
+add v24.4S, v24.4S, \x  // 
+ABCD={X+A,X+B,X+C,X+D}
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
+ext v25.16B, v26.16B, v25.16B, #12  // 
ext(,0ABC,12)=00AB
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B+A,X+D+C+B}   (+00AB)
+ext v25.16B, v26.16B, v25.16B, #12  // 
ext(,00AB,12)=000A
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
+st1 {v24.4S}, [x0], #16 // 
write 4x32-bit final values
+.endm
+
+function ff_compute_safe_ssd_integral_image_neon, export=1
+moviv26.4S, #0  // 
used as zero for the "rotations" in acc_sum_store
+sub x3, x3, w6, UXTW// s1 
padding (s1_linesize - w)

[FFmpeg-devel] [PATCH 3/9] lavfi/nlmeans: use ptrdiff_t for linesizes

2018-05-06 Thread Clément Bœsch
Similarly to the previous commit, this will help when writing SIMD code by
avoiding manual zero-extension in the SIMD code.
---
 libavfilter/vf_nlmeans.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 21f981a605..4119fa3e01 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -60,9 +60,9 @@ typedef struct NLMeansContext {
 uint32_t *ii_orig;  // integral image
 uint32_t *ii;   // integral image starting 
after the 0-line and 0-column
 int ii_w, ii_h; // width and height of the 
integral image
-int ii_lz_32;   // linesize in 32-bit units of 
the integral image
+ptrdiff_t ii_lz_32; // linesize in 32-bit units of 
the integral image
 struct weighted_avg *wa;// weighted average of every 
pixel
-int wa_linesize;// linesize for wa in struct 
size unit
+ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
 double weight_lut[WEIGHT_LUT_SIZE]; // lookup table mapping 
(scaled) patch differences to their associated weights
 double pdiff_lut_scale; // scale factor for patch 
differences before looking into the LUT
 int max_meaningful_diff;// maximum difference 
considered (if the patch difference is too high we ignore the pixel)
@@ -150,9 +150,9 @@ static inline int get_integral_patch_value(const uint32_t 
*ii, int ii_lz_32, int
  * while for SIMD implementation it is likely more interesting to use the
  * two-loops algorithm variant.
  */
-static void compute_safe_ssd_integral_image_c(uint32_t *dst, int 
dst_linesize_32,
-  const uint8_t *s1, int linesize1,
-  const uint8_t *s2, int linesize2,
+static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+  const uint8_t *s1, ptrdiff_t 
linesize1,
+  const uint8_t *s2, ptrdiff_t 
linesize2,
   int w, int h)
 {
 int x, y;
@@ -198,9 +198,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, int dst_linesize_32
  * @param w width to compute
  * @param h height to compute
  */
-static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, int 
dst_linesize_32,
+static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
  int startx, int starty,
- const uint8_t *src, int 
linesize,
+ const uint8_t *src, 
ptrdiff_t linesize,
  int offx, int offy, int 
r, int sw, int sh,
  int w, int h)
 {
@@ -240,8 +240,8 @@ static inline void 
compute_unsafe_ssd_integral_image(uint32_t *dst, int dst_line
  * @param h source height
  * @param e research padding edge
  */
-static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
-   const uint8_t *src, int linesize, int 
offx, int offy,
+static void compute_ssd_integral_image(uint32_t *ii, ptrdiff_t ii_linesize_32,
+   const uint8_t *src, ptrdiff_t linesize, 
int offx, int offy,
int e, int w, int h)
 {
 // ii has a surrounding padding of thickness "e"
@@ -367,7 +367,7 @@ static int config_input(AVFilterLink *inlink)
 
 struct thread_data {
 const uint8_t *src;
-int src_linesize;
+ptrdiff_t src_linesize;
 int startx, starty;
 int endx, endy;
 const uint32_t *ii_start;
@@ -379,7 +379,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 int x, y;
 NLMeansContext *s = ctx->priv;
 const struct thread_data *td = arg;
-const int src_linesize = td->src_linesize;
+const ptrdiff_t src_linesize = td->src_linesize;
 const int process_h = td->endy - td->starty;
 const int slice_start = (process_h *  jobnr   ) / nb_jobs;
 const int slice_end   = (process_h * (jobnr+1)) / nb_jobs;
@@ -403,8 +403,8 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 }
 
 static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
- uint8_t *dst, int dst_linesize,
- const uint8_t *src, int src_linesize)
+ uint8_t *dst, ptrdiff_t dst_linesize,
+ const uint8_t *src, ptrdiff_t src_line

[FFmpeg-devel] [PATCH 8/9] lavfi/nlmeans: move final weighted averaging out of nlmeans_plane

2018-05-06 Thread Clément Bœsch
This helps figuring out where the filter is slow:

  70.53%  ffmpeg_g  ffmpeg_g  [.] nlmeans_slice
  25.73%  ffmpeg_g  ffmpeg_g  [.] compute_safe_ssd_integral_image_c
   1.74%  ffmpeg_g  ffmpeg_g  [.] compute_unsafe_ssd_integral_image
   0.82%  ffmpeg_g  ffmpeg_g  [.] ff_mjpeg_decode_sos
   0.51%  ffmpeg_g  [unknown] [k] 0x91800a80
   0.24%  ffmpeg_g  ffmpeg_g  [.] weight_averages

(Tested with a large image that takes several seconds to process)

Since this function is irrelevant speed wise, the file's TODO is
updated.
---
 libavfilter/vf_nlmeans.c | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 201e4feb41..abe708a2fc 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -20,7 +20,6 @@
 
 /**
  * @todo
- * - SIMD for final weighted averaging
  * - better automatic defaults? see "Parameters" @ 
http://www.ipol.im/pub/art/2011/bcm_nlm/
  * - temporal support (probably doesn't need any displacement according to
  *   "Denoising image sequences does not require motion estimation")
@@ -411,11 +410,30 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 return 0;
 }
 
+static void weight_averages(uint8_t *dst, ptrdiff_t dst_linesize,
+const uint8_t *src, ptrdiff_t src_linesize,
+struct weighted_avg *wa, ptrdiff_t wa_linesize,
+int w, int h)
+{
+int x, y;
+
+for (y = 0; y < h; y++) {
+for (x = 0; x < w; x++) {
+// Also weight the centered pixel
+wa[x].total_weight += 1.0;
+wa[x].sum += 1.0 * src[x];
+dst[x] = av_clip_uint8(wa[x].sum / wa[x].total_weight);
+}
+dst += dst_linesize;
+src += src_linesize;
+wa += wa_linesize;
+}
+}
+
 static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
  uint8_t *dst, ptrdiff_t dst_linesize,
  const uint8_t *src, ptrdiff_t src_linesize)
 {
-int x, y;
 int offx, offy;
 NLMeansContext *s = ctx->priv;
 /* patches center points cover the whole research window so the patches
@@ -448,17 +466,10 @@ static int nlmeans_plane(AVFilterContext *ctx, int w, int 
h, int p, int r,
 }
 }
 }
-for (y = 0; y < h; y++) {
-for (x = 0; x < w; x++) {
-struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
 
-// Also weight the centered pixel
-wa->total_weight += 1.0;
-wa->sum += 1.0 * src[y*src_linesize + x];
+weight_averages(dst, dst_linesize, src, src_linesize,
+s->wa, s->wa_linesize, w, h);
 
-dst[y*dst_linesize + x] = av_clip_uint8(wa->sum / 
wa->total_weight);
-}
-}
 return 0;
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 6/9] lavfi/nlmeans: make compute_safe_ssd_integral_image_c faster

2018-05-06 Thread Clément Bœsch
before:  ssd_integral_image_c: 49204.6
after:   ssd_integral_image_c: 44272.8

Unrolling by 4 made the biggest difference on odroid-c2 (aarch64);
unrolling by 2 or 8 both gave 46k cycles vs 44k for 4.

Additionally, this is a much better reference when writing SIMD (SIMD
vectorization will just target 16 instead of 4).
---
 libavfilter/vf_nlmeans.c | 27 +--
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index c30e44498f..f37f1183f7 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -146,10 +146,6 @@ static inline int get_integral_patch_value(const uint32_t 
*ii, int ii_lz_32, int
  * function, we do not need any clipping here.
  *
  * The line above dst and the column to its left are always readable.
- *
- * This C version computes the SSD integral image using a scalar accumulator,
- * while for SIMD implementation it is likely more interesting to use the
- * two-loops algorithm variant.
  */
 static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
   const uint8_t *s1, ptrdiff_t 
linesize1,
@@ -157,21 +153,32 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, ptrdiff_t dst_lines
   int w, int h)
 {
 int x, y;
+const uint32_t *dst_top = dst - dst_linesize_32;
 
 /* SIMD-friendly assumptions allowed here */
 av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
 
 for (y = 0; y < h; y++) {
-uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
-
-for (x = 0; x < w; x++) {
-const int d  = s1[x] - s2[x];
-acc += d * d;
-dst[x] = dst[-dst_linesize_32 + x] + acc;
+for (x = 0; x < w; x += 4) {
+const int d0 = s1[x] - s2[x];
+const int d1 = s1[x + 1] - s2[x + 1];
+const int d2 = s1[x + 2] - s2[x + 2];
+const int d3 = s1[x + 3] - s2[x + 3];
+
+dst[x] = dst_top[x] - dst_top[x - 1] + d0*d0;
+dst[x + 1] = dst_top[x + 1] - dst_top[x] + d1*d1;
+dst[x + 2] = dst_top[x + 2] - dst_top[x + 1] + d2*d2;
+dst[x + 3] = dst_top[x + 3] - dst_top[x + 2] + d3*d3;
+
+dst[x] += dst[x - 1];
+dst[x + 1] += dst[x];
+dst[x + 2] += dst[x + 1];
+dst[x + 3] += dst[x + 2];
 }
 s1  += linesize1;
 s2  += linesize2;
 dst += dst_linesize_32;
+dst_top += dst_linesize_32;
 }
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 9/9] lavfi/nlmeans: reorder memory accesses in get_integral_patch_value

2018-05-06 Thread Clément Bœsch
This doesn't seem to make much of a difference but it can't hurt.
---
 libavfilter/vf_nlmeans.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index abe708a2fc..38c50bc94a 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -131,10 +131,10 @@ static int query_formats(AVFilterContext *ctx)
  */
 static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, 
int x, int y, int p)
 {
-const int e = ii[(y + p) * ii_lz_32 + (x + p)];
-const int d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
-const int b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
 const int a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];
+const int b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
+const int d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
+const int e = ii[(y + p) * ii_lz_32 + (x + p)];
 return e - d - b + a;
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 7/9] lavfi/nlmeans: switch from double to float

2018-05-06 Thread Clément Bœsch
Overall speed appears to be 1.1x faster with no noticeable quality impact.
---
 libavfilter/vf_nlmeans.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index f37f1183f7..201e4feb41 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -40,8 +40,8 @@
 #include "video.h"
 
 struct weighted_avg {
-double total_weight;
-double sum;
+float total_weight;
+float sum;
 };
 
 #define WEIGHT_LUT_NBITS 9
@@ -63,8 +63,8 @@ typedef struct NLMeansContext {
 ptrdiff_t ii_lz_32; // linesize in 32-bit units of 
the integral image
 struct weighted_avg *wa;// weighted average of every 
pixel
 ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
-double weight_lut[WEIGHT_LUT_SIZE]; // lookup table mapping 
(scaled) patch differences to their associated weights
-double pdiff_lut_scale; // scale factor for patch 
differences before looking into the LUT
+float weight_lut[WEIGHT_LUT_SIZE];  // lookup table mapping 
(scaled) patch differences to their associated weights
+float pdiff_lut_scale;  // scale factor for patch 
differences before looking into the LUT
 int max_meaningful_diff;// maximum difference 
considered (if the patch difference is too high we ignore the pixel)
 NLMeansDSPContext dsp;
 } NLMeansContext;
@@ -206,7 +206,7 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, ptrdiff_t dst_lines
  * @param w width to compute
  * @param h height to compute
  */
-static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+static void compute_unsafe_ssd_integral_image(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
  int startx, int starty,
  const uint8_t *src, 
ptrdiff_t linesize,
  int offx, int offy, int 
r, int sw, int sh,
@@ -402,7 +402,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const int patch_diff_sq = get_integral_patch_value(td->ii_start, 
s->ii_lz_32, x, y, td->p);
 if (patch_diff_sq < s->max_meaningful_diff) {
 const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
-const double weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
+const float weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
 wa[x].total_weight += weight;
 wa[x].sum += weight * src[x];
 }
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 7/9] lavfi/nlmeans: switch from double to float

2018-05-06 Thread Moritz Barsnick
On Sun, May 06, 2018 at 13:40:58 +0200, Clément Bœsch wrote:
> Overall speed appears to be 1.1x faster with no noticeable quality impact.

Probably platform dependent?

>  struct weighted_avg {
> -double total_weight;
> -double sum;
> +float total_weight;
> +float sum;
>  };

I believe these calculations in nlmeans_plane() will promote to double
before being cast back to float:

   // Also weight the centered pixel
wa->total_weight += 1.0;
wa->sum += 1.0 * src[y*src_linesize + x];

(At least the second one. The first one - just an assignment of a
constant - is covered by the preprocessor, IIUC.) They need to use
"1.0f".

(There are others, but only in init(), which don't matter for
performance.)

Cheers,
Moritz
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/9] lavfi/nlmeans: add SIMD-friendly assumptions for compute_safe_ssd_integral_image

2018-05-06 Thread Michael Niedermayer
On Sun, May 06, 2018 at 01:40:53PM +0200, Clément Bœsch wrote:
> SIMD code will not have to deal with padding itself. Overwriting in that
> function may have been possible but involve large overreading of the
> sources. Instead, we simply make sure the width to process is always a
> multiple of 16. Additionally, there must be some actual area to process
> so the SIMD code can have its boundary checks after processing the first
> pixels.
> ---
>  libavfilter/vf_nlmeans.c | 25 ++---
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index d222d3913e..21f981a605 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
> *dst, int dst_linesize_32
>  {
>  int x, y;
>  
> +/* SIMD-friendly assumptions allowed here */
> +av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
> +
>  for (y = 0; y < h; y++) {
>  uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
>  
> @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int 
> ii_linesize_32,
>  // to compare the 2 sources pixels
>  const int startx_safe = FFMAX(s1x, s2x);
>  const int starty_safe = FFMAX(s1y, s2y);
> -const int endx_safe   = FFMIN(s1x + w, s2x + w);
> +const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned
>  const int endy_safe   = FFMIN(s1y + h, s2y + h);
>  
> +// deduce the safe area width and height
> +const int safe_pw = (u_endx_safe - startx_safe) & ~0xf;
> +const int safe_ph = endy_safe - starty_safe;
> +
> +// adjusted end x position of the safe area after width of the safe area 
> gets aligned
> +const int endx_safe = startx_safe + safe_pw;
> +
>  // top part where only one of s1 and s2 is still readable, or none at all
>  compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
>0, 0,
> @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, 
> int ii_linesize_32,
>0, starty_safe,
>src, linesize,
>offx, offy, e, w, h,
> -  startx_safe, endy_safe - starty_safe);
> +  startx_safe, safe_ph);
>  
>  // main and safe part of the integral
>  av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w);
>  av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h);
>  av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w);
>  av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h);
> -compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + 
> startx_safe, ii_linesize_32,
> -  src + (starty_safe - s1y) * linesize + 
> (startx_safe - s1x), linesize,
> -  src + (starty_safe - s2y) * linesize + 
> (startx_safe - s2x), linesize,
> -  endx_safe - startx_safe, endy_safe - 
> starty_safe);
> +if (safe_pw && safe_ph)
> +dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 
> + startx_safe, ii_linesize_32,
> + src + (starty_safe - s1y) * 
> linesize + (startx_safe - s1x), linesize,
> + src + (starty_safe - s2y) * 
> linesize + (startx_safe - s2x), linesize,
> + safe_pw, safe_ph);


I think this is broken, or I am missing some change:

libavfilter/vf_nlmeans.c: In function ‘compute_ssd_integral_image’:
libavfilter/vf_nlmeans.c:294:9: error: ‘dsp’ undeclared (first use in this 
function)
 dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 + 
startx_safe, ii_linesize_32,
 ^
libavfilter/vf_nlmeans.c:294:9: note: each undeclared identifier is reported 
only once for each function it appears in
libavfilter/vf_nlmeans.c: At top level:
libavfilter/vf_nlmeans.c:153:13: warning: ‘compute_safe_ssd_integral_image_c’ 
defined but not used [-Wunused-function]
 static void compute_safe_ssd_integral_image_c(uint32_t *dst, int 
dst_linesize_32,
 ^
make: *** [libavfilter/vf_nlmeans.o] Error 1
make: *** Waiting for unfinished jobs

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Everything should be made as simple as possible, but not simpler.
-- Albert Einstein


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/9] lavfi/nlmeans: random code shuffling to help compiler

2018-05-07 Thread Michael Niedermayer
On Sun, May 06, 2018 at 01:40:52PM +0200, Clément Bœsch wrote:
> This makes nlmeans_slice() slightly faster at least on GCC 7.3.
> ---
>  libavfilter/vf_nlmeans.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)

LGTM

thx

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Modern terrorism, a quick summary: Need oil, start war with country that
has oil, kill hundread thousand in war. Let country fall into chaos,
be surprised about raise of fundamantalists. Drop more bombs, kill more
people, be surprised about them taking revenge and drop even more bombs
and strip your own citizens of their rights and freedoms. to be continued


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/9] lavfi/nlmeans: add SIMD-friendly assumptions for compute_safe_ssd_integral_image

2018-05-07 Thread Clément Bœsch
On Mon, May 07, 2018 at 12:14:37AM +0200, Michael Niedermayer wrote:
> On Sun, May 06, 2018 at 01:40:53PM +0200, Clément Bœsch wrote:
> > SIMD code will not have to deal with padding itself. Overwriting in that
> > function may have been possible but involve large overreading of the
> > sources. Instead, we simply make sure the width to process is always a
> > multiple of 16. Additionally, there must be some actual area to process
> > so the SIMD code can have its boundary checks after processing the first
> > pixels.
> > ---
> >  libavfilter/vf_nlmeans.c | 25 ++---
> >  1 file changed, 18 insertions(+), 7 deletions(-)
> > 
> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > index d222d3913e..21f981a605 100644
> > --- a/libavfilter/vf_nlmeans.c
> > +++ b/libavfilter/vf_nlmeans.c
> > @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
> > *dst, int dst_linesize_32
> >  {
> >  int x, y;
> >  
> > +/* SIMD-friendly assumptions allowed here */
> > +av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
> > +
> >  for (y = 0; y < h; y++) {
> >  uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
> >  
> > @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, 
> > int ii_linesize_32,
> >  // to compare the 2 sources pixels
> >  const int startx_safe = FFMAX(s1x, s2x);
> >  const int starty_safe = FFMAX(s1y, s2y);
> > -const int endx_safe   = FFMIN(s1x + w, s2x + w);
> > +const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned
> >  const int endy_safe   = FFMIN(s1y + h, s2y + h);
> >  
> > +// deduce the safe area width and height
> > +const int safe_pw = (u_endx_safe - startx_safe) & ~0xf;
> > +const int safe_ph = endy_safe - starty_safe;
> > +
> > +// adjusted end x position of the safe area after width of the safe 
> > area gets aligned
> > +const int endx_safe = startx_safe + safe_pw;
> > +
> >  // top part where only one of s1 and s2 is still readable, or none at 
> > all
> >  compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
> >0, 0,
> > @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, 
> > int ii_linesize_32,
> >0, starty_safe,
> >src, linesize,
> >offx, offy, e, w, h,
> > -  startx_safe, endy_safe - 
> > starty_safe);
> > +  startx_safe, safe_ph);
> >  
> >  // main and safe part of the integral
> >  av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w);
> >  av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h);
> >  av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w);
> >  av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h);
> > -compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + 
> > startx_safe, ii_linesize_32,
> > -  src + (starty_safe - s1y) * linesize 
> > + (startx_safe - s1x), linesize,
> > -  src + (starty_safe - s2y) * linesize 
> > + (startx_safe - s2x), linesize,
> > -  endx_safe - startx_safe, endy_safe - 
> > starty_safe);
> > +if (safe_pw && safe_ph)
> > +dsp->compute_safe_ssd_integral_image(ii + 
> > starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32,
> > + src + (starty_safe - s1y) * 
> > linesize + (startx_safe - s1x), linesize,
> > + src + (starty_safe - s2y) * 
> > linesize + (startx_safe - s2x), linesize,
> > + safe_pw, safe_ph);
> 
> 
> i think this is or i am missing some change
> 
> libavfilter/vf_nlmeans.c: In function ‘compute_ssd_integral_image’:
> libavfilter/vf_nlmeans.c:294:9: error: ‘dsp’ undeclared (first use in this 
> function)
>  dsp->compute_safe_ssd_integral_image(ii + starty_safe*ii_linesize_32 
> + startx_safe, ii_linesize_32,
>  ^
> libavfilter/vf_nlmeans.c:294:9: note: each undeclared identifier is reported 
> only once for each function it appears in
> libavfilter/vf_nlmeans.c: At top level:
> libavfilter/vf_nlmeans.c:153:13: warning: ‘compute_safe_ssd_integral_image_c’ 
> defined but not used [-Wunused-function]
>  static void compute_safe_ssd_integral_image_c(uint32_t *dst, int 
> dst_linesize_32,
>  ^
> make: *** [libavfilter/vf_nlmeans.o] Error 1
> make: *** Waiting for unfinished jobs

Yeah, I made a mistake while splitting the commit; this is fixed locally. At
this step it's supposed to still be calling
compute_safe_ssd_integral_image_c() directly (but its last 2 parameters
changed).

-- 
Clément B.


signature.asc
Description: PGP signature
__

Re: [FFmpeg-devel] [PATCH 7/9] lavfi/nlmeans: switch from double to float

2018-05-07 Thread Clément Bœsch
On Sun, May 06, 2018 at 04:53:54PM +0200, Moritz Barsnick wrote:
> On Sun, May 06, 2018 at 13:40:58 +0200, Clément Bœsch wrote:
> > Overall speed appears to be 1.1x faster with no noticeable quality impact.
> 
> Probably platform dependant?
> 
> >  struct weighted_avg {
> > -double total_weight;
> > -double sum;
> > +float total_weight;
> > +float sum;
> >  };
> 
> I believe these calculaions in nlmeans_plane() will promote to double
> before being cast back to float:
> 
>// Also weight the centered pixel
> wa->total_weight += 1.0;
> wa->sum += 1.0 * src[y*src_linesize + x];
> 
> (At least the second one. The first one - just an assignment of a
> constant - is covered by the preprocessor, IIUC.) They need to use
> "1.0f".
> 

It doesn't really matter here actually, in "lavfi/nlmeans: move final
weighted averaging out of nlmeans_plane" you can see that this code
represents 0.24% of the CPU time. I fixed it locally anyway, thanks.

> (There are others, but only in init(), which don't matter for
> performance.)

Yeah, I left these to double on purpose.

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] Misc improvements in nlmeans filter [v2]

2018-05-07 Thread Clément Bœsch
Changes since v1:

- fixed float operation in double as pointed out by Moritz
- fix broken commit split as pointed out by Michael
- added patch 10: "use unsigned for the integral patch"
- misc instruction shuffling in AArch64 SIMD for better performances

I plan to push this soon unless someone wants more time to review.

BTW, an x86 SIMD patch is welcome; the filter badly needs some performance
improvements. Also, any suggestions on how not to make it spend 80% of
the time in nlmeans_slice() are welcome.

Regards,


___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 01/10] lavfi/nlmeans: random code shuffling to help compiler

2018-05-07 Thread Clément Bœsch
This makes nlmeans_slice() slightly faster at least on GCC 7.3.
---
 libavfilter/vf_nlmeans.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index e4952e187e..d222d3913e 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -368,7 +368,6 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 int x, y;
 NLMeansContext *s = ctx->priv;
 const struct thread_data *td = arg;
-const uint8_t *src = td->src;
 const int src_linesize = td->src_linesize;
 const int process_h = td->endy - td->starty;
 const int slice_start = (process_h *  jobnr   ) / nb_jobs;
@@ -377,14 +376,15 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const int endy   = td->starty + slice_end;
 
 for (y = starty; y < endy; y++) {
+const uint8_t *src = td->src + y*src_linesize;
+struct weighted_avg *wa = s->wa + y*s->wa_linesize;
 for (x = td->startx; x < td->endx; x++) {
 const int patch_diff_sq = get_integral_patch_value(td->ii_start, 
s->ii_lz_32, x, y, td->p);
 if (patch_diff_sq < s->max_meaningful_diff) {
-struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
 const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
 const double weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
-wa->total_weight += weight;
-wa->sum += weight * src[y*src_linesize + x];
+wa[x].total_weight += weight;
+wa[x].sum += weight * src[x];
 }
 }
 }
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 03/10] lavfi/nlmeans: use ptrdiff_t for linesizes

2018-05-07 Thread Clément Bœsch
Similarly to the previous commit, this will help writing SIMD code by
avoiding manual zero-extension of the linesizes there.
---
 libavfilter/vf_nlmeans.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 3f0a43ee72..b081a4e5af 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -60,9 +60,9 @@ typedef struct NLMeansContext {
 uint32_t *ii_orig;  // integral image
 uint32_t *ii;   // integral image starting 
after the 0-line and 0-column
 int ii_w, ii_h; // width and height of the 
integral image
-int ii_lz_32;   // linesize in 32-bit units of 
the integral image
+ptrdiff_t ii_lz_32; // linesize in 32-bit units of 
the integral image
 struct weighted_avg *wa;// weighted average of every 
pixel
-int wa_linesize;// linesize for wa in struct 
size unit
+ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
 double weight_lut[WEIGHT_LUT_SIZE]; // lookup table mapping 
(scaled) patch differences to their associated weights
 double pdiff_lut_scale; // scale factor for patch 
differences before looking into the LUT
 int max_meaningful_diff;// maximum difference 
considered (if the patch difference is too high we ignore the pixel)
@@ -150,9 +150,9 @@ static inline int get_integral_patch_value(const uint32_t 
*ii, int ii_lz_32, int
  * while for SIMD implementation it is likely more interesting to use the
  * two-loops algorithm variant.
  */
-static void compute_safe_ssd_integral_image_c(uint32_t *dst, int 
dst_linesize_32,
-  const uint8_t *s1, int linesize1,
-  const uint8_t *s2, int linesize2,
+static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+  const uint8_t *s1, ptrdiff_t 
linesize1,
+  const uint8_t *s2, ptrdiff_t 
linesize2,
   int w, int h)
 {
 int x, y;
@@ -198,9 +198,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, int dst_linesize_32
  * @param w width to compute
  * @param h height to compute
  */
-static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, int 
dst_linesize_32,
+static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
  int startx, int starty,
- const uint8_t *src, int 
linesize,
+ const uint8_t *src, 
ptrdiff_t linesize,
  int offx, int offy, int 
r, int sw, int sh,
  int w, int h)
 {
@@ -240,8 +240,8 @@ static inline void 
compute_unsafe_ssd_integral_image(uint32_t *dst, int dst_line
  * @param h source height
  * @param e research padding edge
  */
-static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
-   const uint8_t *src, int linesize, int 
offx, int offy,
+static void compute_ssd_integral_image(uint32_t *ii, ptrdiff_t ii_linesize_32,
+   const uint8_t *src, ptrdiff_t linesize, 
int offx, int offy,
int e, int w, int h)
 {
 // ii has a surrounding padding of thickness "e"
@@ -367,7 +367,7 @@ static int config_input(AVFilterLink *inlink)
 
 struct thread_data {
 const uint8_t *src;
-int src_linesize;
+ptrdiff_t src_linesize;
 int startx, starty;
 int endx, endy;
 const uint32_t *ii_start;
@@ -379,7 +379,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 int x, y;
 NLMeansContext *s = ctx->priv;
 const struct thread_data *td = arg;
-const int src_linesize = td->src_linesize;
+const ptrdiff_t src_linesize = td->src_linesize;
 const int process_h = td->endy - td->starty;
 const int slice_start = (process_h *  jobnr   ) / nb_jobs;
 const int slice_end   = (process_h * (jobnr+1)) / nb_jobs;
@@ -403,8 +403,8 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 }
 
 static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
- uint8_t *dst, int dst_linesize,
- const uint8_t *src, int src_linesize)
+ uint8_t *dst, ptrdiff_t dst_linesize,
+ const uint8_t *src, ptrdiff_t src_line

[FFmpeg-devel] [PATCH v2 04/10] lavfi/nlmeans: add AArch64 SIMD for compute_safe_ssd_integral_image

2018-05-07 Thread Clément Bœsch
ssd_integral_image_c: 49204.6
ssd_integral_image_neon: 28346.8
---
 libavfilter/aarch64/Makefile  |  3 +
 libavfilter/aarch64/vf_nlmeans_init.c | 33 +++
 libavfilter/aarch64/vf_nlmeans_neon.S | 80 +++
 libavfilter/vf_nlmeans.c  | 26 ++---
 libavfilter/vf_nlmeans.h  | 35 
 5 files changed, 170 insertions(+), 7 deletions(-)
 create mode 100644 libavfilter/aarch64/Makefile
 create mode 100644 libavfilter/aarch64/vf_nlmeans_init.c
 create mode 100644 libavfilter/aarch64/vf_nlmeans_neon.S
 create mode 100644 libavfilter/vf_nlmeans.h

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
new file mode 100644
index 00..b58daa3a3f
--- /dev/null
+++ b/libavfilter/aarch64/Makefile
@@ -0,0 +1,3 @@
+OBJS-$(CONFIG_NLMEANS_FILTER)+= aarch64/vf_nlmeans_init.o
+
+NEON-OBJS-$(CONFIG_NLMEANS_FILTER)   += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_nlmeans_init.c 
b/libavfilter/aarch64/vf_nlmeans_init.c
new file mode 100644
index 00..a1edefb144
--- /dev/null
+++ b/libavfilter/aarch64/vf_nlmeans_init.c
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/vf_nlmeans.h"
+
+void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+ const uint8_t *s1, ptrdiff_t 
linesize1,
+ const uint8_t *s2, ptrdiff_t 
linesize2,
+ int w, int h);
+
+av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp)
+{
+int cpu_flags = av_get_cpu_flags();
+
+if (have_neon(cpu_flags))
+dsp->compute_safe_ssd_integral_image = 
ff_compute_safe_ssd_integral_image_neon;
+}
diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
b/libavfilter/aarch64/vf_nlmeans_neon.S
new file mode 100644
index 00..6308a428db
--- /dev/null
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 Clément Bœsch 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
+.macro acc_sum_store x, xb
+dup v24.4S, v24.4S[3]   // 
...X -> 
+ext v25.16B, v26.16B, \xb, #12  // 
ext(,ABCD,12)=0ABC
+add v24.4S, v24.4S, \x  // 
+ABCD={X+A,X+B,X+C,X+D}
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
+ext v25.16B, v26.16B, v25.16B, #12  // 
ext(,0ABC,12)=00AB
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B+A,X+D+C+B}   (+00AB)
+ext v25.16B, v26.16B, v25.16B, #12  // 
ext(,00AB,12)=000A
+add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
+st1 {v24.4S}, [x0], #16 // 
write 4x32-bit final values
+.endm
+
+function ff_compute_safe_ssd_integral_image_neon, export=1
+movi v26.4S, #0  // 
used as zero for the "rotations" in acc_sum_store
+sub x3, x3, w6, UXTW// s1 
padding (s1_linesize - w)

[FFmpeg-devel] [PATCH v2 02/10] lavfi/nlmeans: add SIMD-friendly assumptions for compute_safe_ssd_integral_image

2018-05-07 Thread Clément Bœsch
SIMD code will not have to deal with padding itself. Overwriting in that
function may have been possible but would involve large overreads of the
sources. Instead, we simply make sure the width to process is always a
multiple of 16. Additionally, there must be some actual area to process
so the SIMD code can have its boundary checks after processing the first
pixels.
---
 libavfilter/vf_nlmeans.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index d222d3913e..3f0a43ee72 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, int dst_linesize_32
 {
 int x, y;
 
+/* SIMD-friendly assumptions allowed here */
+av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
+
 for (y = 0; y < h; y++) {
 uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
 
@@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int 
ii_linesize_32,
 // to compare the 2 sources pixels
 const int startx_safe = FFMAX(s1x, s2x);
 const int starty_safe = FFMAX(s1y, s2y);
-const int endx_safe   = FFMIN(s1x + w, s2x + w);
+const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned
 const int endy_safe   = FFMIN(s1y + h, s2y + h);
 
+// deduce the safe area width and height
+const int safe_pw = (u_endx_safe - startx_safe) & ~0xf;
+const int safe_ph = endy_safe - starty_safe;
+
+// adjusted end x position of the safe area after width of the safe area 
gets aligned
+const int endx_safe = startx_safe + safe_pw;
+
 // top part where only one of s1 and s2 is still readable, or none at all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   0, 0,
@@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int 
ii_linesize_32,
   0, starty_safe,
   src, linesize,
   offx, offy, e, w, h,
-  startx_safe, endy_safe - starty_safe);
+  startx_safe, safe_ph);
 
 // main and safe part of the integral
 av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w);
 av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h);
 av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w);
 av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h);
-compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + 
startx_safe, ii_linesize_32,
-  src + (starty_safe - s1y) * linesize + 
(startx_safe - s1x), linesize,
-  src + (starty_safe - s2y) * linesize + 
(startx_safe - s2x), linesize,
-  endx_safe - startx_safe, endy_safe - 
starty_safe);
+if (safe_pw && safe_ph)
+compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + 
startx_safe, ii_linesize_32,
+  src + (starty_safe - s1y) * linesize 
+ (startx_safe - s1x), linesize,
+  src + (starty_safe - s2y) * linesize 
+ (startx_safe - s2x), linesize,
+  safe_pw, safe_ph);
 
 // right part of the integral
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   endx_safe, starty_safe,
   src, linesize,
   offx, offy, e, w, h,
-  ii_w - endx_safe, endy_safe - 
starty_safe);
+  ii_w - endx_safe, safe_ph);
 
 // bottom part where only one of s1 and s2 is still readable, or none at 
all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 06/10] lavfi/nlmeans: make compute_safe_ssd_integral_image_c faster

2018-05-07 Thread Clément Bœsch
before:  ssd_integral_image_c: 49204.6
after:   ssd_integral_image_c: 44272.8

Unrolling by 4 made the biggest difference on odroid-c2 (aarch64);
unrolling by 2 or 8 both gave 46k cycles vs 44k for 4.

Additionally, this is a much better reference when writing SIMD (SIMD
vectorization will just target 16 instead of 4).
---
 libavfilter/vf_nlmeans.c | 27 +--
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index c30e44498f..f37f1183f7 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -146,10 +146,6 @@ static inline int get_integral_patch_value(const uint32_t 
*ii, int ii_lz_32, int
  * function, we do not need any clipping here.
  *
  * The line above dst and the column to its left are always readable.
- *
- * This C version computes the SSD integral image using a scalar accumulator,
- * while for SIMD implementation it is likely more interesting to use the
- * two-loops algorithm variant.
  */
 static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
   const uint8_t *s1, ptrdiff_t 
linesize1,
@@ -157,21 +153,32 @@ static void compute_safe_ssd_integral_image_c(uint32_t 
*dst, ptrdiff_t dst_lines
   int w, int h)
 {
 int x, y;
+const uint32_t *dst_top = dst - dst_linesize_32;
 
 /* SIMD-friendly assumptions allowed here */
 av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
 
 for (y = 0; y < h; y++) {
-uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
-
-for (x = 0; x < w; x++) {
-const int d  = s1[x] - s2[x];
-acc += d * d;
-dst[x] = dst[-dst_linesize_32 + x] + acc;
+for (x = 0; x < w; x += 4) {
+const int d0 = s1[x] - s2[x];
+const int d1 = s1[x + 1] - s2[x + 1];
+const int d2 = s1[x + 2] - s2[x + 2];
+const int d3 = s1[x + 3] - s2[x + 3];
+
+dst[x] = dst_top[x] - dst_top[x - 1] + d0*d0;
+dst[x + 1] = dst_top[x + 1] - dst_top[x] + d1*d1;
+dst[x + 2] = dst_top[x + 2] - dst_top[x + 1] + d2*d2;
+dst[x + 3] = dst_top[x + 3] - dst_top[x + 2] + d3*d3;
+
+dst[x] += dst[x - 1];
+dst[x + 1] += dst[x];
+dst[x + 2] += dst[x + 1];
+dst[x + 3] += dst[x + 2];
 }
 s1  += linesize1;
 s2  += linesize2;
 dst += dst_linesize_32;
+dst_top += dst_linesize_32;
 }
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 07/10] lavfi/nlmeans: switch from double to float

2018-05-07 Thread Clément Bœsch
Overall speed appears to be 1.1x faster with no noticeable quality
impact.
---
 libavfilter/vf_nlmeans.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index f37f1183f7..aba587f46b 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -40,8 +40,8 @@
 #include "video.h"
 
 struct weighted_avg {
-double total_weight;
-double sum;
+float total_weight;
+float sum;
 };
 
 #define WEIGHT_LUT_NBITS 9
@@ -63,8 +63,8 @@ typedef struct NLMeansContext {
 ptrdiff_t ii_lz_32; // linesize in 32-bit units of 
the integral image
 struct weighted_avg *wa;// weighted average of every 
pixel
 ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
-double weight_lut[WEIGHT_LUT_SIZE]; // lookup table mapping 
(scaled) patch differences to their associated weights
-double pdiff_lut_scale; // scale factor for patch 
differences before looking into the LUT
+float weight_lut[WEIGHT_LUT_SIZE];  // lookup table mapping 
(scaled) patch differences to their associated weights
+float pdiff_lut_scale;  // scale factor for patch 
differences before looking into the LUT
 int max_meaningful_diff;// maximum difference 
considered (if the patch difference is too high we ignore the pixel)
 NLMeansDSPContext dsp;
 } NLMeansContext;
@@ -402,7 +402,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const int patch_diff_sq = get_integral_patch_value(td->ii_start, 
s->ii_lz_32, x, y, td->p);
 if (patch_diff_sq < s->max_meaningful_diff) {
 const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
-const double weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
+const float weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
 wa[x].total_weight += weight;
 wa[x].sum += weight * src[x];
 }
@@ -453,8 +453,8 @@ static int nlmeans_plane(AVFilterContext *ctx, int w, int 
h, int p, int r,
 struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
 
 // Also weight the centered pixel
-wa->total_weight += 1.0;
-wa->sum += 1.0 * src[y*src_linesize + x];
+wa->total_weight += 1.f;
+wa->sum += 1.f * src[y*src_linesize + x];
 
 dst[y*dst_linesize + x] = av_clip_uint8(wa->sum / 
wa->total_weight);
 }
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 10/10] lavfi/nlmeans: use unsigned for the integral patch value

2018-05-07 Thread Clément Bœsch
This value can not be negative.
---
 libavfilter/vf_nlmeans.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 22d26a12e3..547cb80acd 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -64,7 +64,7 @@ typedef struct NLMeansContext {
 ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
 float weight_lut[WEIGHT_LUT_SIZE];  // lookup table mapping 
(scaled) patch differences to their associated weights
 float pdiff_lut_scale;  // scale factor for patch 
differences before looking into the LUT
-int max_meaningful_diff;// maximum difference 
considered (if the patch difference is too high we ignore the pixel)
+uint32_t max_meaningful_diff;   // maximum difference 
considered (if the patch difference is too high we ignore the pixel)
 NLMeansDSPContext dsp;
 } NLMeansContext;
 
@@ -129,12 +129,12 @@ static int query_formats(AVFilterContext *ctx)
  * contains the sum of the squared difference of every corresponding pixels of
  * two input planes of the same size as M.
  */
-static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, 
int x, int y, int p)
+static inline uint32_t get_integral_patch_value(const uint32_t *ii, int 
ii_lz_32, int x, int y, int p)
 {
-const int a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];
-const int b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
-const int d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
-const int e = ii[(y + p) * ii_lz_32 + (x + p)];
+const uint32_t a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];
+const uint32_t b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
+const uint32_t d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
+const uint32_t e = ii[(y + p) * ii_lz_32 + (x + p)];
 return e - d - b + a;
 }
 
@@ -398,9 +398,9 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const uint8_t *src = td->src + y*src_linesize;
 struct weighted_avg *wa = s->wa + y*s->wa_linesize;
 for (x = td->startx; x < td->endx; x++) {
-const int patch_diff_sq = get_integral_patch_value(td->ii_start, 
s->ii_lz_32, x, y, td->p);
+const uint32_t patch_diff_sq = 
get_integral_patch_value(td->ii_start, s->ii_lz_32, x, y, td->p);
 if (patch_diff_sq < s->max_meaningful_diff) {
-const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
+const unsigned weight_lut_idx = patch_diff_sq * 
s->pdiff_lut_scale;
 const float weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
 wa[x].total_weight += weight;
 wa[x].sum += weight * src[x];
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 08/10] lavfi/nlmeans: move final weighted averaging out of nlmeans_plane

2018-05-07 Thread Clément Bœsch
This helps figuring out where the filter is slow:

  70.53%  ffmpeg_g  ffmpeg_g  [.] nlmeans_slice
  25.73%  ffmpeg_g  ffmpeg_g  [.] compute_safe_ssd_integral_image_c
   1.74%  ffmpeg_g  ffmpeg_g  [.] compute_unsafe_ssd_integral_image
   0.82%  ffmpeg_g  ffmpeg_g  [.] ff_mjpeg_decode_sos
   0.51%  ffmpeg_g  [unknown] [k] 0x91800a80
   0.24%  ffmpeg_g  ffmpeg_g  [.] weight_averages

(Tested with a large image that takes several seconds to process)

Since this function is irrelevant speed wise, the file's TODO is
updated.
---
 libavfilter/vf_nlmeans.c | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index aba587f46b..72a75a6e7a 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -20,7 +20,6 @@
 
 /**
  * @todo
- * - SIMD for final weighted averaging
  * - better automatic defaults? see "Parameters" @ 
http://www.ipol.im/pub/art/2011/bcm_nlm/
  * - temporal support (probably doesn't need any displacement according to
  *   "Denoising image sequences does not require motion estimation")
@@ -411,11 +410,30 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 return 0;
 }
 
+static void weight_averages(uint8_t *dst, ptrdiff_t dst_linesize,
+const uint8_t *src, ptrdiff_t src_linesize,
+struct weighted_avg *wa, ptrdiff_t wa_linesize,
+int w, int h)
+{
+int x, y;
+
+for (y = 0; y < h; y++) {
+for (x = 0; x < w; x++) {
+// Also weight the centered pixel
+wa[x].total_weight += 1.f;
+wa[x].sum += 1.f * src[x];
+dst[x] = av_clip_uint8(wa[x].sum / wa[x].total_weight);
+}
+dst += dst_linesize;
+src += src_linesize;
+wa += wa_linesize;
+}
+}
+
 static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
  uint8_t *dst, ptrdiff_t dst_linesize,
  const uint8_t *src, ptrdiff_t src_linesize)
 {
-int x, y;
 int offx, offy;
 NLMeansContext *s = ctx->priv;
 /* patches center points cover the whole research window so the patches
@@ -448,17 +466,10 @@ static int nlmeans_plane(AVFilterContext *ctx, int w, int 
h, int p, int r,
 }
 }
 }
-for (y = 0; y < h; y++) {
-for (x = 0; x < w; x++) {
-struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
 
-// Also weight the centered pixel
-wa->total_weight += 1.f;
-wa->sum += 1.f * src[y*src_linesize + x];
+weight_averages(dst, dst_linesize, src, src_linesize,
+s->wa, s->wa_linesize, w, h);
 
-dst[y*dst_linesize + x] = av_clip_uint8(wa->sum / 
wa->total_weight);
-}
-}
 return 0;
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2 09/10] lavfi/nlmeans: reorder memory accesses in get_integral_patch_value

2018-05-07 Thread Clément Bœsch
This doesn't seem to make much of a difference but it can't hurt.
---
 libavfilter/vf_nlmeans.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 72a75a6e7a..22d26a12e3 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -131,10 +131,10 @@ static int query_formats(AVFilterContext *ctx)
  */
 static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, 
int x, int y, int p)
 {
-const int e = ii[(y + p) * ii_lz_32 + (x + p)];
-const int d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
-const int b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
 const int a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];
+const int b = ii[(y - p - 1) * ii_lz_32 + (x + p)];
+const int d = ii[(y + p) * ii_lz_32 + (x - p - 1)];
+const int e = ii[(y + p) * ii_lz_32 + (x + p)];
 return e - d - b + a;
 }
 
-- 
2.17.0

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Misc improvements in nlmeans filter [v2]

2018-05-07 Thread Paul B Mahol
On 5/7/18, Clement Boesch  wrote:
> Changes since v1:
>
> - fixed float operation in double as pointed out by Moritz
> - fix broken commit split as pointed out by Michael
> - added patch 10: "use unsigned for the integral patch"
> - misc instruction shuffling in AArch64 SIMD for better performances
>
> I plan to push this soon unless someone wants more time to review.
>
> BTW, x86 SIMD patch welcome, the filter badly needs some performance
> improvements. Also, any suggestion on how not to make it spend 80% of
> the time in nlmeans_slice() welcome.
>
> Regards,

LGTM
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2 04/10] lavfi/nlmeans: add AArch64 SIMD for compute_safe_ssd_integral_image

2018-05-07 Thread Michael Niedermayer
On Mon, May 07, 2018 at 07:24:16PM +0200, Clément Bœsch wrote:
> ssd_integral_image_c: 49204.6
> ssd_integral_image_neon: 28346.8
> ---
>  libavfilter/aarch64/Makefile  |  3 +
>  libavfilter/aarch64/vf_nlmeans_init.c | 33 +++
>  libavfilter/aarch64/vf_nlmeans_neon.S | 80 +++
>  libavfilter/vf_nlmeans.c  | 26 ++---
>  libavfilter/vf_nlmeans.h  | 35 
>  5 files changed, 170 insertions(+), 7 deletions(-)
>  create mode 100644 libavfilter/aarch64/Makefile
>  create mode 100644 libavfilter/aarch64/vf_nlmeans_init.c
>  create mode 100644 libavfilter/aarch64/vf_nlmeans_neon.S
>  create mode 100644 libavfilter/vf_nlmeans.h

seems to break make testprogs unless I am missing something

CC  libavfilter/tests/integral.o
libavfilter/tests/integral.c: In function ‘main’:
libavfilter/tests/integral.c:68:40: warning: passing argument 1 of 
‘compute_ssd_integral_image’ from incompatible pointer type [enabled by default]
src, lz, xoff, yoff, e, w, h);
^
In file included from libavfilter/tests/integral.c:19:0:
./libavfilter/vf_nlmeans.c:244:13: note: expected ‘const struct 
NLMeansDSPContext *’ but argument is of type ‘uint32_t *’
 static void compute_ssd_integral_image(const NLMeansDSPContext *dsp,
 ^
libavfilter/tests/integral.c:68:40: warning: passing argument 2 of 
‘compute_ssd_integral_image’ makes pointer from integer without a cast [enabled 
by default]
src, lz, xoff, yoff, e, w, h);
^
In file included from libavfilter/tests/integral.c:19:0:
./libavfilter/vf_nlmeans.c:244:13: note: expected ‘uint32_t *’ but argument is 
of type ‘int’
 static void compute_ssd_integral_image(const NLMeansDSPContext *dsp,
 ^
libavfilter/tests/integral.c:68:40: warning: passing argument 3 of 
‘compute_ssd_integral_image’ makes integer from pointer without a cast [enabled 
by default]
src, lz, xoff, yoff, e, w, h);
^
In file included from libavfilter/tests/integral.c:19:0:
./libavfilter/vf_nlmeans.c:244:13: note: expected ‘ptrdiff_t’ but argument is 
of type ‘const uint8_t *’
 static void compute_ssd_integral_image(const NLMeansDSPContext *dsp,
 ^
libavfilter/tests/integral.c:68:40: warning: passing argument 4 of 
‘compute_ssd_integral_image’ makes pointer from integer without a cast [enabled 
by default]
src, lz, xoff, yoff, e, w, h);
^
In file included from libavfilter/tests/integral.c:19:0:
./libavfilter/vf_nlmeans.c:244:13: note: expected ‘const uint8_t *’ but 
argument is of type ‘int’
 static void compute_ssd_integral_image(const NLMeansDSPContext *dsp,
 ^
libavfilter/tests/integral.c:68:40: error: too few arguments to function 
‘compute_ssd_integral_image’
src, lz, xoff, yoff, e, w, h);
^
In file included from libavfilter/tests/integral.c:19:0:
./libavfilter/vf_nlmeans.c:244:13: note: declared here
 static void compute_ssd_integral_image(const NLMeansDSPContext *dsp,
 ^
make: *** [libavfilter/tests/integral.o] Error 1
make: Target `testprogs' not remade because of errors.

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If you drop bombs on a foreign country and kill a hundred thousand
innocent people, expect your government to call the consequence
"unprovoked inhuman terrorist attacks" and use it to justify dropping
more bombs and killing more people. The technology changed, the idea is old.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2 04/10] lavfi/nlmeans: add AArch64 SIMD for compute_safe_ssd_integral_image

2018-05-08 Thread Clément Bœsch
On Tue, May 08, 2018 at 04:06:26AM +0200, Michael Niedermayer wrote:
> On Mon, May 07, 2018 at 07:24:16PM +0200, Clément Bœsch wrote:
> > ssd_integral_image_c: 49204.6
> > ssd_integral_image_neon: 28346.8
> > ---
> >  libavfilter/aarch64/Makefile  |  3 +
> >  libavfilter/aarch64/vf_nlmeans_init.c | 33 +++
> >  libavfilter/aarch64/vf_nlmeans_neon.S | 80 +++
> >  libavfilter/vf_nlmeans.c  | 26 ++---
> >  libavfilter/vf_nlmeans.h  | 35 
> >  5 files changed, 170 insertions(+), 7 deletions(-)
> >  create mode 100644 libavfilter/aarch64/Makefile
> >  create mode 100644 libavfilter/aarch64/vf_nlmeans_init.c
> >  create mode 100644 libavfilter/aarch64/vf_nlmeans_neon.S
> >  create mode 100644 libavfilter/vf_nlmeans.h
> 
> seems to break make testprogs unless iam missing something
> 

Oops, I forgot I added such a test. It's a bit redundant with the checkasm
test and now mostly useless due to the recently introduced 16B padding
constraint, but I fixed it anyway locally.

[...]

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Misc improvements in nlmeans filter [v2]

2018-05-08 Thread Clément Bœsch
On Mon, May 07, 2018 at 07:32:55PM +0200, Paul B Mahol wrote:
> On 5/7/18, Clement Boesch  wrote:
> > Changes since v1:
> >
> > - fixed float operation in double as pointed out by Moritz
> > - fix broken commit split as pointed out by Michael
> > - added patch 10: "use unsigned for the integral patch"
> > - misc instruction shuffling in AArch64 SIMD for better performances
> >
> > I plan to push this soon unless someone wants more time to review.
> >
> > BTW, x86 SIMD patch welcome, the filter badly needs some performance
> > improvements. Also, any suggestion on how not to make it spend 80% of
> > the time in nlmeans_slice() welcome.
> >
> > Regards,
> 
> LGTM

patchset applied

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-26 Thread Jan Ekström
Clang is more strict about some things.
---
 libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
b/libavfilter/aarch64/vf_nlmeans_neon.S
index 6308a428db..ac16157bbd 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -22,7 +22,7 @@
 
 // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
 .macro acc_sum_store x, xb
-dup v24.4S, v24.4S[3]   // 
...X -> 
+dup v24.4s, v24.s[3]// 
...X -> 
 ext v25.16B, v26.16B, \xb, #12  // 
ext(,ABCD,12)=0ABC
 add v24.4S, v24.4S, \x  // 
+ABCD={X+A,X+B,X+C,X+D}
 add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
@@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
 movi v26.4S, #0  // 
used as zero for the "rotations" in acc_sum_store
 sub x3, x3, w6, UXTW// s1 
padding (s1_linesize - w)
 sub x5, x5, w6, UXTW// s2 
padding (s2_linesize - w)
-sub x9, x0, x1, UXTW #2 // 
dst_top
+sub x9, x0, w1, UXTW #2 // 
dst_top
 sub x1, x1, w6, UXTW// dst 
padding (dst_linesize_32 - w)
 lsl x1, x1, #2  // dst 
padding expressed in bytes
 1:  mov w10, w6 // 
width copy for each line
-- 
2.17.1

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-27 Thread Clément Bœsch
On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
> Clang is more strict about some things.
> ---
>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
> b/libavfilter/aarch64/vf_nlmeans_neon.S
> index 6308a428db..ac16157bbd 100644
> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
> @@ -22,7 +22,7 @@
>  
>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>  .macro acc_sum_store x, xb
> -dup v24.4S, v24.4S[3]   // 
> ...X -> 
> +dup v24.4s, v24.s[3]// 
> ...X -> 

can you keep the capitalized form?

>  ext v25.16B, v26.16B, \xb, #12  // 
> ext(,ABCD,12)=0ABC
>  add v24.4S, v24.4S, \x  // 
> +ABCD={X+A,X+B,X+C,X+D}
>  add v24.4S, v24.4S, v25.4S  // 
> {X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>  moviv26.4S, #0  // 
> used as zero for the "rotations" in acc_sum_store
>  sub x3, x3, w6, UXTW// 
> s1 padding (s1_linesize - w)
>  sub x5, x5, w6, UXTW// 
> s2 padding (s2_linesize - w)
> -sub x9, x0, x1, UXTW #2 // 
> dst_top
> +sub x9, x0, w1, UXTW #2 // 
> dst_top
>  sub x1, x1, w6, UXTW// 
> dst padding (dst_linesize_32 - w)
>  lsl x1, x1, #2  // 
> dst padding expressed in bytes
>  1:  mov w10, w6 // 
> width copy for each line

LGTM otherwise, thx

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-28 Thread Jan Ekström
On Fri, Jul 27, 2018 at 11:14 AM, Clément Bœsch  wrote:
> On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
>> Clang is more strict about some things.
>> ---
>>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
>> b/libavfilter/aarch64/vf_nlmeans_neon.S
>> index 6308a428db..ac16157bbd 100644
>> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
>> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
>> @@ -22,7 +22,7 @@
>>
>>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>>  .macro acc_sum_store x, xb
>> -dup v24.4S, v24.4S[3]   // 
>> ...X -> 
>> +dup v24.4s, v24.s[3]// 
>> ...X -> 
>
> can you keep the capitalized form?
>
>>  ext v25.16B, v26.16B, \xb, #12  // 
>> ext(,ABCD,12)=0ABC
>>  add v24.4S, v24.4S, \x  // 
>> +ABCD={X+A,X+B,X+C,X+D}
>>  add v24.4S, v24.4S, v25.4S  // 
>> {X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
>> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>>  moviv26.4S, #0  // 
>> used as zero for the "rotations" in acc_sum_store
>>  sub x3, x3, w6, UXTW// 
>> s1 padding (s1_linesize - w)
>>  sub x5, x5, w6, UXTW// 
>> s2 padding (s2_linesize - w)
>> -sub x9, x0, x1, UXTW #2 // 
>> dst_top
>> +sub x9, x0, w1, UXTW #2 // 
>> dst_top
>>  sub x1, x1, w6, UXTW// 
>> dst padding (dst_linesize_32 - w)
>>  lsl x1, x1, #2  // 
>> dst padding expressed in bytes
>>  1:  mov w10, w6 // 
>> width copy for each line
>
> LGTM otherwise, thx
>

Fixed the capitalization with the s suffix and used "fix" instead of
"fixup" in the commit message due to IRC review.

Pushed, and default compilation configuration should now be fixed
again with aarch64+clang.

Jan
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-01-31 Thread Jun Zhao
Remove pdiff_lut_scale from nlmeans. When searching the weight_lut table
in nlmeans_slice(), the old way needs floating-point arithmetic using
pdiff_lut_scale. This change avoids using pdiff_lut_scale in the
weight_lut table search, which improves performance by about 12%
(for a 1080p picture).

Use the profiling cmd like:
perf stat -a -d -r 5 ./ffmpeg -i input -an -vf nlmeans=s=30 -vframes 10 \
-f null /dev/null

without this change:
when s=1.0(default value) 63s
 s=30.0   72s
after this change:
 s=1.0(default value) 56s
 s=30.0   63s

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |   12 
 1 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 82e779c..72eb819 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -43,8 +43,7 @@ struct weighted_avg {
 float sum;
 };
 
-#define WEIGHT_LUT_NBITS 9
-#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
+#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
 
 typedef struct NLMeansContext {
 const AVClass *class;
@@ -63,7 +62,6 @@ typedef struct NLMeansContext {
 struct weighted_avg *wa;// weighted average of every 
pixel
 ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
 float weight_lut[WEIGHT_LUT_SIZE];  // lookup table mapping 
(scaled) patch differences to their associated weights
-float pdiff_lut_scale;  // scale factor for patch 
differences before looking into the LUT
 uint32_t max_meaningful_diff;   // maximum difference 
considered (if the patch difference is too high we ignore the pixel)
 NLMeansDSPContext dsp;
 } NLMeansContext;
@@ -401,8 +399,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const uint32_t patch_diff_sq = e - d - b + a;
 
 if (patch_diff_sq < s->max_meaningful_diff) {
-const unsigned weight_lut_idx = patch_diff_sq * 
s->pdiff_lut_scale;
-const float weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
+const float weight = s->weight_lut[patch_diff_sq]; // 
exp(-patch_diff_sq * s->pdiff_scale)
 wa[x].total_weight += weight;
 wa[x].sum += weight * src[x];
 }
@@ -527,10 +524,9 @@ static av_cold int init(AVFilterContext *ctx)
 
 s->pdiff_scale = 1. / (h * h);
 s->max_meaningful_diff = -log(1/255.) / s->pdiff_scale;
-s->pdiff_lut_scale = 1./s->max_meaningful_diff * WEIGHT_LUT_SIZE;
-av_assert0((s->max_meaningful_diff - 1) * s->pdiff_lut_scale < 
FF_ARRAY_ELEMS(s->weight_lut));
+av_assert0((s->max_meaningful_diff - 1) < FF_ARRAY_ELEMS(s->weight_lut));
 for (i = 0; i < WEIGHT_LUT_SIZE; i++)
-s->weight_lut[i] = exp(-i / s->pdiff_lut_scale * s->pdiff_scale);
+s->weight_lut[i] = exp(-i * s->pdiff_scale);
 
 CHECK_ODD_FIELD(research_size,   "Luma research window");
 CHECK_ODD_FIELD(patch_size,  "Luma patch");
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-01-31 Thread Carl Eugen Hoyos
2019-01-31 14:55 GMT+01:00, Jun Zhao :
> Remove the pdiff_lut_scale in nlmeans, when search the weight_luttable
> in nlmeans_slices(), the old way need to the float-point arithmetic
> using pdiff_lut_scale. This change will avoid using pdiff_lut_scale
> in the weight_lut table search, it's will improve the performance about
> 12%. (1080P size picture).

Please mention the change in heap memory requirement with numbers.
Remove "old way need to the float-point arithmetic" because new way
also needs (some) floating-point arithmetic.
Feel free not to mention pdiff_lut_scale in the commit message.

Carl Eugen
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-01-31 Thread [email protected]
On Fri, Feb 1, 2019 at 3:57 AM Carl Eugen Hoyos  wrote:
>
> 2019-01-31 14:55 GMT+01:00, Jun Zhao :
> > Remove the pdiff_lut_scale in nlmeans, when search the weight_luttable
> > in nlmeans_slices(), the old way need to the float-point arithmetic
> > using pdiff_lut_scale. This change will avoid using pdiff_lut_scale
> > in the weight_lut table search, it's will improve the performance about
> > 12%. (1080P size picture).
>
> Please mention the change in heap memory requirement with numbers.
> Remove "old way need to the float-point arithmetic" because new way
> also needs (some) floating-point arithmetic.
> Feel free not to mention pdiff_lut_scale in the commit message.
>
Will update the commit message again, Tks
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-01-31 Thread Jun Zhao
Remove pdiff_lut_scale from nlmeans and increase the weight_lut table size
from 2^9 to 800000. This change avoids using pdiff_lut_scale in
nlmeans_slice() for the weight_lut table search, which improves
performance by about 12% (for a 1080p picture).

Use the profiling command like:

perf stat -a -d -r 5 ./ffmpeg -i input -an -vf nlmeans=s=30 -vframes 10 \
-f null /dev/null

without this change:
when s=1.0(default value) 63s
 s=30.0   72s

after this change:
 s=1.0(default value) 56s
 s=30.0   63s

Reviewed-by: Carl Eugen Hoyos 
Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |   12 
 1 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 82e779c..72eb819 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -43,8 +43,7 @@ struct weighted_avg {
 float sum;
 };
 
-#define WEIGHT_LUT_NBITS 9
-#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
+#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
 
 typedef struct NLMeansContext {
 const AVClass *class;
@@ -63,7 +62,6 @@ typedef struct NLMeansContext {
 struct weighted_avg *wa;// weighted average of every 
pixel
 ptrdiff_t wa_linesize;  // linesize for wa in struct 
size unit
 float weight_lut[WEIGHT_LUT_SIZE];  // lookup table mapping 
(scaled) patch differences to their associated weights
-float pdiff_lut_scale;  // scale factor for patch 
differences before looking into the LUT
 uint32_t max_meaningful_diff;   // maximum difference 
considered (if the patch difference is too high we ignore the pixel)
 NLMeansDSPContext dsp;
 } NLMeansContext;
@@ -401,8 +399,7 @@ static int nlmeans_slice(AVFilterContext *ctx, void *arg, 
int jobnr, int nb_jobs
 const uint32_t patch_diff_sq = e - d - b + a;
 
 if (patch_diff_sq < s->max_meaningful_diff) {
-const unsigned weight_lut_idx = patch_diff_sq * 
s->pdiff_lut_scale;
-const float weight = s->weight_lut[weight_lut_idx]; // 
exp(-patch_diff_sq * s->pdiff_scale)
+const float weight = s->weight_lut[patch_diff_sq]; // 
exp(-patch_diff_sq * s->pdiff_scale)
 wa[x].total_weight += weight;
 wa[x].sum += weight * src[x];
 }
@@ -527,10 +524,9 @@ static av_cold int init(AVFilterContext *ctx)
 
 s->pdiff_scale = 1. / (h * h);
 s->max_meaningful_diff = -log(1/255.) / s->pdiff_scale;
-s->pdiff_lut_scale = 1./s->max_meaningful_diff * WEIGHT_LUT_SIZE;
-av_assert0((s->max_meaningful_diff - 1) * s->pdiff_lut_scale < 
FF_ARRAY_ELEMS(s->weight_lut));
+av_assert0((s->max_meaningful_diff - 1) < FF_ARRAY_ELEMS(s->weight_lut));
 for (i = 0; i < WEIGHT_LUT_SIZE; i++)
-s->weight_lut[i] = exp(-i / s->pdiff_lut_scale * s->pdiff_scale);
+s->weight_lut[i] = exp(-i * s->pdiff_scale);
 
 CHECK_ODD_FIELD(research_size,   "Luma research window");
 CHECK_ODD_FIELD(patch_size,  "Luma patch");
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] doc/filters: document ranges and defaults for nlmeans options

2019-01-31 Thread Jun Zhao
document ranges and defaults for nlmeans options

Signed-off-by: Jun Zhao 
---
 doc/filters.texi |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index fc98323..d588315 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -12296,10 +12296,10 @@ The filter accepts the following options.
 
 @table @option
 @item s
-Set denoising strength.
+Set denoising strength. Default is 1.0. Must be in range [1.0, 30.0].
 
 @item p
-Set patch size.
+Set patch size. Default is 7. Must be odd number in range [0, 99].
 
 @item pc
 Same as @option{p} but for chroma planes.
@@ -12307,7 +12307,7 @@ Same as @option{p} but for chroma planes.
 The default value is @var{0} and means automatic.
 
 @item r
-Set research size.
+Set research size. Default is 15. Must be odd number in range [0, 99].
 
 @item rc
 Same as @option{r} but for chroma planes.
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread Clément Bœsch
On Fri, Feb 01, 2019 at 10:45:24AM +0800, Jun Zhao wrote:
> Remove the pdiff_lut_scale in nlmeans and increase weight_lut table size
> from 2^9 to 800000, this change will avoid using pdiff_lut_scale in
> nlmeans_slice() for weight_lut table search, it's will improve the
> performance about 12%. (in 1080P size picture case).
> 
> Use the profiling command like:
> 
> perf stat -a -d -r 5 ./ffmpeg -i input -an -vf nlmeans=s=30 -vframes 10 \
> -f null /dev/null
> 
> without this change:
> when s=1.0(default value) 63s
>  s=30.0   72s
> 
> after this change:
>  s=1.0(default value) 56s
>  s=30.0   63s

Nice.

I assume this is tested on x86_64?

> 
> Reviewed-by: Carl Eugen Hoyos 
> Signed-off-by: Jun Zhao 
> ---
>  libavfilter/vf_nlmeans.c |   12 
>  1 files changed, 4 insertions(+), 8 deletions(-)
> 
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index 82e779c..72eb819 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -43,8 +43,7 @@ struct weighted_avg {
>  float sum;
>  };
>  
> -#define WEIGHT_LUT_NBITS 9
> -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)

So the LUT is now 3.2MB?

Why 300? 300*300*log(255) is closer to 500 000 than 800 000

[...]

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread [email protected]
On Fri, Feb 1, 2019 at 4:29 PM Clément Bœsch  wrote:
>
> On Fri, Feb 01, 2019 at 10:45:24AM +0800, Jun Zhao wrote:
> > Remove the pdiff_lut_scale in nlmeans and increase weight_lut table size
> > from 2^9 to 800000, this change will avoid using pdiff_lut_scale in
> > nlmeans_slice() for weight_lut table search, it's will improve the
> > performance about 12%. (in 1080P size picture case).
> >
> > Use the profiling command like:
> >
> > perf stat -a -d -r 5 ./ffmpeg -i input -an -vf nlmeans=s=30 -vframes 10 \
> > -f null /dev/null
> >
> > without this change:
> > when s=1.0(default value) 63s
> >  s=30.0   72s
> >
> > after this change:
> >  s=1.0(default value) 56s
> >  s=30.0   63s
>
> Nice.
>
> I assume this is tested on x86_64?
Yes
>
> >
> > Reviewed-by: Carl Eugen Hoyos 
> > Signed-off-by: Jun Zhao 
> > ---
> >  libavfilter/vf_nlmeans.c |   12 
> >  1 files changed, 4 insertions(+), 8 deletions(-)
> >
> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > index 82e779c..72eb819 100644
> > --- a/libavfilter/vf_nlmeans.c
> > +++ b/libavfilter/vf_nlmeans.c
> > @@ -43,8 +43,7 @@ struct weighted_avg {
> >  float sum;
> >  };
> >
> > -#define WEIGHT_LUT_NBITS 9
> > -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> > +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
>
> So the LUT is now 3.2MB?
>
> Why 300? 300*300*log(255) is closer to 500 000 than 800 000
I just selected a value > 300*300*log(255) (500 000, to be more precise)
for this case, somewhat at liberty in fact; the other option is to use
dynamic memory allocation for the weight_lut table, sized based on
max_meaningful_diff :), but maybe seems pretty obvious, I think 3M is
not a big burden for nlmeans
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread Clément Bœsch
On Fri, Feb 01, 2019 at 04:57:37PM +0800, [email protected] wrote:
[...]
> > > -#define WEIGHT_LUT_NBITS 9
> > > -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> > > +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
> >
> > So the LUT is now 3.2MB?
> >
> > Why 300? 300*300*log(255) is closer to 500 000 than 800 000
> I just seleted a value > 300*300*log(255) (500 000 more precise) for
> this case at liberty in fact , the other option is use a dynamic
> allocation memory for weight_lut table size base on the
> max_meaningful_diff :), but maybe seems pretty obvious, I think 3M is
> not a big burden for nlmeans

It's probably fine yes, I'm just confused at the comment: why does it
*needs* to be > 300 * 300 * log(255)?

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread [email protected]
On Fri, Feb 1, 2019 at 5:11 PM Clément Bœsch  wrote:
>
> On Fri, Feb 01, 2019 at 04:57:37PM +0800, [email protected] wrote:
> [...]
> > > > -#define WEIGHT_LUT_NBITS 9
> > > > -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> > > > +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
> > >
> > > So the LUT is now 3.2MB?
> > >
> > > Why 300? 300*300*log(255) is closer to 500 000 than 800 000
> > I just seleted a value > 300*300*log(255) (500 000 more precise) for
> > this case at liberty in fact , the other option is use a dynamic
> > allocation memory for weight_lut table size base on the
> > max_meaningful_diff :), but maybe seems pretty obvious, I think 3M is
> > not a big burden for nlmeans
>
> It's probably fine yes, I'm just confused at the comment: why does it
> *needs* to be > 300 * 300 * log(255)?
>
> --
ohhh, 300 = max(s) * 10 :), max(s) = 30, this is the reason.

In fact, max size of WEIGHT_LUT_SIZE ==  max (max_meaningful_diff), then we
can avoid use pdiff_lut_scale in nlmeans, because now pdiff_lut_scale == 1.
:)

and max( max_meaningful_diff ) = -log(1/255.0) * h * h = log(255) * max
(h) * max(h) = log(255) * max (10*s) * max(10*s)
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread Clément Bœsch
On Fri, Feb 01, 2019 at 05:19:53PM +0800, [email protected] wrote:
> On Fri, Feb 1, 2019 at 5:11 PM Clément Bœsch  wrote:
> >
> > On Fri, Feb 01, 2019 at 04:57:37PM +0800, [email protected] wrote:
> > [...]
> > > > > -#define WEIGHT_LUT_NBITS 9
> > > > > -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> > > > > +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
> > > >
> > > > So the LUT is now 3.2MB?
> > > >
> > > > Why 300? 300*300*log(255) is closer to 500 000 than 800 000
> > > I just seleted a value > 300*300*log(255) (500 000 more precise) for
> > > this case at liberty in fact , the other option is use a dynamic
> > > allocation memory for weight_lut table size base on the
> > > max_meaningful_diff :), but maybe seems pretty obvious, I think 3M is
> > > not a big burden for nlmeans
> >
> > It's probably fine yes, I'm just confused at the comment: why does it
> > *needs* to be > 300 * 300 * log(255)?
> >
> > --
> ohhh, 300 = max(s) * 10 :), max(s) = 30, this is the reason.
> 
> In fact, max size of WEIGHT_LUT_SIZE ==  max (max_meaningful_diff), then we
> can avoid use pdiff_lut_scale in nlmeans, becasue now pdiff_lut_scale == 1.
> :)
> 
> and max( max_meaningful_diff ) = -log(-1/255.0) * h * h = log(255) * max
> (h) * max(h) = log(255) * max (10*s) * max(10*s)

Ok, makes sense. Would you mind updating the comment to something like:

/* Note: WEIGHT_LUT_SIZE must be larger than max_meaningful_diff
 * (log(255)*max(h)^2, which is approximately 500000 with the current
 * maximum sigma of 30). The current value is arbitrary and could be
 * tweaked or defined dynamically. */
#define WEIGHT_LUT_SIZE 800000

I will test your patch tonight (let's say in about 10 hours given my
current timezone) and apply if everything works fine.

Thanks

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread [email protected]
On Fri, Feb 1, 2019 at 5:43 PM Clément Bœsch  wrote:

> On Fri, Feb 01, 2019 at 05:19:53PM +0800, [email protected] wrote:
> > On Fri, Feb 1, 2019 at 5:11 PM Clément Bœsch  wrote:
> > >
> > > On Fri, Feb 01, 2019 at 04:57:37PM +0800, [email protected] wrote:
> > > [...]
> > > > > > -#define WEIGHT_LUT_NBITS 9
> > > > > > -#define WEIGHT_LUT_SIZE  (1<<WEIGHT_LUT_NBITS)
> > > > > > +#define WEIGHT_LUT_SIZE  (800000) // need to >  300 * 300 * log(255)
> > > > >
> > > > > So the LUT is now 3.2MB?
> > > > >
> > > > > Why 300? 300*300*log(255) is closer to 500 000 than 800 000
> > > > I just seleted a value > 300*300*log(255) (500 000 more precise) for
> > > > this case at liberty in fact , the other option is use a dynamic
> > > > allocation memory for weight_lut table size base on the
> > > > max_meaningful_diff :), but maybe seems pretty obvious, I think 3M is
> > > > not a big burden for nlmeans
> > >
> > > It's probably fine yes, I'm just confused at the comment: why does it
> > > *needs* to be > 300 * 300 * log(255)?
> > >
> > > --
> > ohhh, 300 = max(s) * 10 :), max(s) = 30, this is the reason.
> >
> > In fact, max size of WEIGHT_LUT_SIZE ==  max (max_meaningful_diff), then
> we
> > can avoid use pdiff_lut_scale in nlmeans, becasue now pdiff_lut_scale ==
> 1.
> > :)
> >
> > and max( max_meaningful_diff ) = -log(-1/255.0) * h * h = log(255) * max
> > (h) * max(h) = log(255) * max (10*s) * max(10*s)
>
> Ok, makes sense. Would you mind updating the comment to something like:
>
> /* Note: WEIGHT_LUT_SIZE must be larger than max_meaningful_diff
>  * (log(255)*max(h)^2, which is approximately 500000 with the current
>  * maximum sigma of 30). The current value is arbitrary and could be
>  * tweaked or defined dynamically. */
> #define WEIGHT_LUT_SIZE 800000
>
> I will test your patch tonight (let's say in about 10 hours given my
> current timezone) and apply if everything works fine.
>
> it's OK, and I think you can change 800000 to 500000 at the same time
if the patch is Ok. Tks
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] doc/filters: document ranges and defaults for nlmeans options

2019-02-01 Thread Gyan



On 01-02-2019 11:43 AM, Jun Zhao wrote:

document ranges and defaults for nlmeans options

Signed-off-by: Jun Zhao 
---
  doc/filters.texi |6 +++---
  1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index fc98323..d588315 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -12296,10 +12296,10 @@ The filter accepts the following options.
  
  @table @option

  @item s
-Set denoising strength.
+Set denoising strength. Default is 1.0. Must be in range [1.0, 30.0].
  
  @item p

-Set patch size.
+Set patch size. Default is 7. Must be odd number in range [0, 99].
  
  @item pc

  Same as @option{p} but for chroma planes.
@@ -12307,7 +12307,7 @@ Same as @option{p} but for chroma planes.
  The default value is @var{0} and means automatic.
  
  @item r

-Set research size.
+Set research size. Default is 15. Must be odd number in range [0, 99].
  
  @item rc

  Same as @option{r} but for chroma planes.

LGTM.

Gyan
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/vf_nlmeans: Improve the performance for nlmeans

2019-02-01 Thread Clément Bœsch
On Fri, Feb 01, 2019 at 05:50:37PM +0800, [email protected] wrote:
[...]
> > Ok, makes sense. Would you mind updating the comment to something like:
> >
> > /* Note: WEIGHT_LUT_SIZE must be larger than max_meaningful_diff
> >  * (log(255)*max(h)^2, which is approximately 500000 with the current
> >  * maximum sigma of 30). The current value is arbitrary and could be
> >  * tweaked or defined dynamically. */
> > #define WEIGHT_LUT_SIZE 800000
> >
> > I will test your patch tonight (let's say in about 10 hours given my
> > current timezone) and apply if everything works fine.
> >
> it's OK, and I think you can change 800000 to 500000 at the same time
> if the patch is Ok. Tks

I did change it to 500000, added a comment, updated the description, and
pushed. I also made the dynamic change you suggested earlier on the weight
LUT.

Thanks for the patch,

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 3/4] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-05 Thread Jun Zhao
From: Jun Zhao 

Accumulating 8-bit values (uint8_t *src) into a 32-bit data type
(uint32_t *ii) carries the risk of an integral value becoming larger than
the 32-bit integer capacity, resulting in an integer overflow. To guard
against this risk, add a check with a warning message.

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index dcb5a03..31c8304 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
NLMeansDSPContext *dsp,
 // adjusted end x position of the safe area after width of the safe area 
gets aligned
 const int endx_safe = startx_safe + safe_pw;
 
+// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
+// data type, it will have a risk of an integral value becoming larger than
+// the 32-bits integer capacity and resulting in an integer overflow.
+if ((w * h * UINT8_MAX) > UINT32_MAX)
+av_log(NULL, AV_LOG_WARNING,
+   "image (%d x %d) integral value maybe overflow.\n", w ,h);
+
 // top part where only one of s1 and s2 is still readable, or none at all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   0, 0,
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 3/4] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-05 Thread Carl Eugen Hoyos
2019-03-06 4:18 GMT+01:00, Jun Zhao :
> From: Jun Zhao 
>
> accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> data type, it will have a risk of an integral value becoming larger than
> the 32-bits integer capacity and resulting in an integer overflow. For
> this risk, add a checking with warning message.
>
> Signed-off-by: Jun Zhao 
> ---
>  libavfilter/vf_nlmeans.c |7 +++
>  1 files changed, 7 insertions(+), 0 deletions(-)
>
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index dcb5a03..31c8304 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const
> NLMeansDSPContext *dsp,
>  // adjusted end x position of the safe area after width of the safe
> area gets aligned
>  const int endx_safe = startx_safe + safe_pw;
>
> +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t
> *ii)
> +// data type, it will have a risk of an integral value becoming larger
> than
> +// the 32-bits integer capacity and resulting in an integer overflow.
> +if ((w * h * UINT8_MAX) > UINT32_MAX)

I don't think UINT8_MAX increases readability and I suspect
this should contain "UINT32_MAX / (w*h)" or similar on
one side.

> +av_log(NULL, AV_LOG_WARNING,
> +   "image (%d x %d) integral value maybe overflow.\n", w ,h);

may overflow?

Carl Eugen
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 3/4] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread [email protected]
On Wed, Mar 6, 2019 at 3:55 PM Carl Eugen Hoyos  wrote:
>
> 2019-03-06 4:18 GMT+01:00, Jun Zhao :
> > From: Jun Zhao 
> >
> > accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> > data type, it will have a risk of an integral value becoming larger than
> > the 32-bits integer capacity and resulting in an integer overflow. For
> > this risk, add a checking with warning message.
> >
> > Signed-off-by: Jun Zhao 
> > ---
> >  libavfilter/vf_nlmeans.c |7 +++
> >  1 files changed, 7 insertions(+), 0 deletions(-)
> >
> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > index dcb5a03..31c8304 100644
> > --- a/libavfilter/vf_nlmeans.c
> > +++ b/libavfilter/vf_nlmeans.c
> > @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const
> > NLMeansDSPContext *dsp,
> >  // adjusted end x position of the safe area after width of the safe
> > area gets aligned
> >  const int endx_safe = startx_safe + safe_pw;
> >
> > +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t
> > *ii)
> > +// data type, it will have a risk of an integral value becoming larger
> > than
> > +// the 32-bits integer capacity and resulting in an integer overflow.
> > +if ((w * h * UINT8_MAX) > UINT32_MAX)
>
> I don't think UINT8_MAX increases readability and I suspect
> this should contain "UINT32_MAX / (w*h)" or similar on
> one side.
>
You means like: UINT32_MAX/w < (UINT8_MAX * h) ?
> > +av_log(NULL, AV_LOG_WARNING,
> > +   "image (%d x %d) integral value maybe overflow.\n", w ,h);
>
> may overflow?
Will update the warning message as the suggestion.Tks
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 3/4] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread Carl Eugen Hoyos
2019-03-06 12:31 GMT+01:00, [email protected] :
> On Wed, Mar 6, 2019 at 3:55 PM Carl Eugen Hoyos  wrote:
>>
>> 2019-03-06 4:18 GMT+01:00, Jun Zhao :

>> > +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits
>> > (uint32_t *ii)
>> > +// data type, it will have a risk of an integral value becoming
>> > larger than
>> > +// the 32-bits integer capacity and resulting in an integer
>> > overflow.
>> > +if ((w * h * UINT8_MAX) > UINT32_MAX)
>>
>> I don't think UINT8_MAX increases readability and I suspect
>> this should contain "UINT32_MAX / (w*h)" or similar on
>> one side.
>>
> You means like: UINT32_MAX/w < (UINT8_MAX * h) ?

Actually: (UINT32_MAX / w < 255 * h)
(But that may only be me)

Carl Eugen
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH V2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread Jun Zhao
From: Jun Zhao 

accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
data type, it will have a risk of an integral value becoming larger than
the 32-bits integer capacity and resulting in an integer overflow. For
this risk, add a checking with warning message.

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index dcb5a03..e7015cd 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
NLMeansDSPContext *dsp,
 // adjusted end x position of the safe area after width of the safe area 
gets aligned
 const int endx_safe = startx_safe + safe_pw;
 
+// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
+// data type, it will have a risk of an integral value becoming larger than
+// the 32-bits integer capacity and resulting in an integer overflow.
+if ((UINT32_MAX / (uint64_t)w) < (UINT8_MAX * (uint64_t)h))
+av_log(NULL, AV_LOG_WARNING,
+   "image (%d x %d) integral value maybe overflow.\n", w ,h);
+
 // top part where only one of s1 and s2 is still readable, or none at all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   0, 0,
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread [email protected]
On Thu, Mar 7, 2019 at 9:39 AM Jun Zhao  wrote:
>
> From: Jun Zhao 
>
> accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> data type, it will have a risk of an integral value becoming larger than
> the 32-bits integer capacity and resulting in an integer overflow. For
> this risk, add a checking with warning message.
>
> Signed-off-by: Jun Zhao 
> ---
>  libavfilter/vf_nlmeans.c |7 +++
>  1 files changed, 7 insertions(+), 0 deletions(-)
>
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index dcb5a03..e7015cd 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
> NLMeansDSPContext *dsp,
>  // adjusted end x position of the safe area after width of the safe area 
> gets aligned
>  const int endx_safe = startx_safe + safe_pw;
>
> +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t 
> *ii)
> +// data type, it will have a risk of an integral value becoming larger 
> than
> +// the 32-bits integer capacity and resulting in an integer overflow.
> +if ((UINT32_MAX / (uint64_t)w) < (UINT8_MAX * (uint64_t)h))
> +av_log(NULL, AV_LOG_WARNING,
> +   "image (%d x %d) integral value maybe overflow.\n", w ,h);
> +
>  // top part where only one of s1 and s2 is still readable, or none at all
>  compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
>0, 0,
> --
> 1.7.1
Pls ignore this patch, send a wrong patch
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH V3] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread Jun Zhao
From: Jun Zhao 

accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
data type, it will have a risk of an integral value becoming larger than
the 32-bits integer capacity and resulting in an integer overflow. For
this risk, add a checking with warning message.

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index dcb5a03..9876aae 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
NLMeansDSPContext *dsp,
 // adjusted end x position of the safe area after width of the safe area 
gets aligned
 const int endx_safe = startx_safe + safe_pw;
 
+// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
+// data type, it will have a risk of an integral value becoming larger than
+// the 32-bits integer capacity and resulting in an integer overflow.
+if ((UINT32_MAX / (uint64_t)w) < (255 * (uint64_t)h))
+av_log(NULL, AV_LOG_WARNING,
+   "image (%d x %d) integral value may overflow.\n", w ,h);
+
 // top part where only one of s1 and s2 is still readable, or none at all
 compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
   0, 0,
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 3/4] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-06 Thread [email protected]
On Wed, Mar 6, 2019 at 9:15 PM Carl Eugen Hoyos  wrote:
>
> 2019-03-06 12:31 GMT+01:00, [email protected] :
> > On Wed, Mar 6, 2019 at 3:55 PM Carl Eugen Hoyos 
wrote:
> >>
> >> 2019-03-06 4:18 GMT+01:00, Jun Zhao :
>
> >> > +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits
> >> > (uint32_t *ii)
> >> > +// data type, it will have a risk of an integral value becoming
> >> > larger than
> >> > +// the 32-bits integer capacity and resulting in an integer
> >> > overflow.
> >> > +if ((w * h * UINT8_MAX) > UINT32_MAX)
> >>
> >> I don't think UINT8_MAX increases readability and I suspect
> >> this should contain "UINT32_MAX / (w*h)" or similar on
> >> one side.
> >>
> > You means like: UINT32_MAX/w < (UINT8_MAX * h) ?
>
> Actually: (UINT32_MAX / w < 255 * h)
> (But that may only be me)
>
Update V3 patch for this check in https://patchwork.ffmpeg.org/patch/12230/,
Tks
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V3] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-07 Thread Michael Niedermayer
On Thu, Mar 07, 2019 at 09:45:12AM +0800, Jun Zhao wrote:
> From: Jun Zhao 
> 
> accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> data type, it will have a risk of an integral value becoming larger than
> the 32-bits integer capacity and resulting in an integer overflow. For
> this risk, add a checking with warning message.
> 
> Signed-off-by: Jun Zhao 
> ---
>  libavfilter/vf_nlmeans.c |7 +++
>  1 files changed, 7 insertions(+), 0 deletions(-)
> 
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index dcb5a03..9876aae 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
> NLMeansDSPContext *dsp,
>  // adjusted end x position of the safe area after width of the safe area 
> gets aligned
>  const int endx_safe = startx_safe + safe_pw;
>  
> +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t 
> *ii)
> +// data type, it will have a risk of an integral value becoming larger 
> than
> +// the 32-bits integer capacity and resulting in an integer overflow.
> +if ((UINT32_MAX / (uint64_t)w) < (255 * (uint64_t)h))
> +av_log(NULL, AV_LOG_WARNING,
> +   "image (%d x %d) integral value may overflow.\n", w ,h);

Printing a warning is not an adequate response to an integer overflow.
Such a thing is undefined behavior (in the case of signed int) and must
not occur.

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I have never wished to cater to the crowd; for what I know they do not
approve, and what they approve I do not know. -- Epicurus


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V3] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-07 Thread Michael Niedermayer
On Thu, Mar 07, 2019 at 09:18:42PM +0100, Michael Niedermayer wrote:
> On Thu, Mar 07, 2019 at 09:45:12AM +0800, Jun Zhao wrote:
> > From: Jun Zhao 
> > 
> > accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> > data type, it will have a risk of an integral value becoming larger than
> > the 32-bits integer capacity and resulting in an integer overflow. For
> > this risk, add a checking with warning message.
> > 
> > Signed-off-by: Jun Zhao 
> > ---
> >  libavfilter/vf_nlmeans.c |7 +++
> >  1 files changed, 7 insertions(+), 0 deletions(-)
> > 
> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > index dcb5a03..9876aae 100644
> > --- a/libavfilter/vf_nlmeans.c
> > +++ b/libavfilter/vf_nlmeans.c
> > @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const 
> > NLMeansDSPContext *dsp,
> >  // adjusted end x position of the safe area after width of the safe 
> > area gets aligned
> >  const int endx_safe = startx_safe + safe_pw;
> >  
> > +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t 
> > *ii)
> > +// data type, it will have a risk of an integral value becoming larger 
> > than
> > +// the 32-bits integer capacity and resulting in an integer overflow.
> > +if ((UINT32_MAX / (uint64_t)w) < (255 * (uint64_t)h))
> > +av_log(NULL, AV_LOG_WARNING,
> > +   "image (%d x %d) integral value may overflow.\n", w ,h);
> 
> Printing a warning is not an adequate response for a integer overflow.
> Such thing is undefined behavior (in case signed of signed int) and must
> not occur.

And if no signed ints are involved, while this is then not undefined, it still
gives the wrong result. That's a bug; the bug should be fixed, rather than a
warning being printed that the bug might be triggered.

Thanks

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

There will always be a question for which you do not know the correct answer.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V3] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-07 Thread [email protected]
On Fri, Mar 8, 2019 at 4:28 AM Michael Niedermayer 
wrote:

> On Thu, Mar 07, 2019 at 09:18:42PM +0100, Michael Niedermayer wrote:
> > On Thu, Mar 07, 2019 at 09:45:12AM +0800, Jun Zhao wrote:
> > > From: Jun Zhao 
> > >
> > > accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t
> *ii)
> > > data type, it will have a risk of an integral value becoming larger
> than
> > > the 32-bits integer capacity and resulting in an integer overflow. For
> > > this risk, add a checking with warning message.
> > >
> > > Signed-off-by: Jun Zhao 
> > > ---
> > >  libavfilter/vf_nlmeans.c |7 +++
> > >  1 files changed, 7 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > > index dcb5a03..9876aae 100644
> > > --- a/libavfilter/vf_nlmeans.c
> > > +++ b/libavfilter/vf_nlmeans.c
> > > @@ -236,6 +236,13 @@ static void compute_ssd_integral_image(const
> NLMeansDSPContext *dsp,
> > >  // adjusted end x position of the safe area after width of the
> safe area gets aligned
> > >  const int endx_safe = startx_safe + safe_pw;
> > >
> > > +// accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits
> (uint32_t *ii)
> > > +// data type, it will have a risk of an integral value becoming
> larger than
> > > +// the 32-bits integer capacity and resulting in an integer
> overflow.
> > > +if ((UINT32_MAX / (uint64_t)w) < (255 * (uint64_t)h))
> > > +av_log(NULL, AV_LOG_WARNING,
> > > +   "image (%d x %d) integral value may overflow.\n", w
> ,h);
> >
> > Printing a warning is not an adequate response for a integer overflow.
> > Such thing is undefined behavior (in case signed of signed int) and must
> > not occur.
>
> And if no signed ints are involved, while this is then not undefined it
> still
> gives the wrong result. Thats a bug, the bug should be fixed not a warning
> be printed that the bug might be triggered
>
> Thanks
>
>
Will give a stricter w * h check for this case, Tks
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH V4 1/2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-08 Thread Jun Zhao
From: Jun Zhao 

Accumulating 8-bit values (uint8_t) into a 32-bit data type (uint32_t *ii)
carries the risk of an integral value becoming larger than the 32-bit
integer capacity, resulting in an integer overflow. To guard against this
risk, reject image sizes that could overflow with an error (EINVAL).

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index dcb5a03..8d47f9d 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -477,6 +477,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 NLMeansContext *s = ctx->priv;
 AVFilterLink *outlink = ctx->outputs[0];
 
+// accumulation of 8-bits uint_8 into 32-bits data type, it will have
+// a risk of an integral value becoming larger than the 32-bits integer
+// capacity and resulting in an integer overflow, so limit the image size
+if ((UINT32_MAX / (uint64_t)inlink->w) < (255 * (uint64_t)inlink->h)) {
+av_log(ctx, AV_LOG_ERROR,
+   "image size (%d x %d) integral value may overflow.\n",
+   inlink->w, inlink->h);
+av_frame_free(&in);
+return AVERROR(EINVAL);
+}
+
 AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 if (!out) {
 av_frame_free(&in);
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH V4 2/2] lavfi/nlmeans: fix mixed declarations and code

2019-03-08 Thread Jun Zhao
From: Jun Zhao 

fix mixed declarations and code in C90 after last change

Signed-off-by: Jun Zhao 
---
 libavfilter/vf_nlmeans.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index 8d47f9d..7497df2 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -476,6 +476,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 AVFilterContext *ctx = inlink->dst;
 NLMeansContext *s = ctx->priv;
 AVFilterLink *outlink = ctx->outputs[0];
+AVFrame *out;
 
 // accumulation of 8-bits uint_8 into 32-bits data type, it will have
 // a risk of an integral value becoming larger than the 32-bits integer
@@ -488,7 +489,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 return AVERROR(EINVAL);
 }
 
-AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 if (!out) {
 av_frame_free(&in);
 return AVERROR(ENOMEM);
-- 
1.7.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V4 1/2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-08 Thread Paul B Mahol
On 3/8/19, Jun Zhao  wrote:
> From: Jun Zhao 
>
> accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> data type, it will have a risk of an integral value becoming larger than
> the 32-bits integer capacity and resulting in an integer overflow. For
> this risk, add a checking with warning message.
>
> Signed-off-by: Jun Zhao 
> ---
>  libavfilter/vf_nlmeans.c |   11 +++
>  1 files changed, 11 insertions(+), 0 deletions(-)
>
> diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> index dcb5a03..8d47f9d 100644
> --- a/libavfilter/vf_nlmeans.c
> +++ b/libavfilter/vf_nlmeans.c
> @@ -477,6 +477,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> *in)
>  NLMeansContext *s = ctx->priv;
>  AVFilterLink *outlink = ctx->outputs[0];
>
> +// accumulation of 8-bits uint_8 into 32-bits data type, it will have
> +// a risk of an integral value becoming larger than the 32-bits integer
> +// capacity and resulting in an integer overflow, so limit the image
> size
> +if ((UINT32_MAX / (uint64_t)inlink->w) < (255 * (uint64_t)inlink->h)) {
> +av_log(ctx, AV_LOG_ERROR,
> +   "image size (%d x %d) integral value may overflow.\n",
> +   inlink->w, inlink->h);
> +av_frame_free(&in);
> +return AVERROR(EINVAL);
> +}
> +
>  AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
>  if (!out) {
>  av_frame_free(&in);

I see no point in this warning; if overflow is a real issue, it should be
fixed instead of giving a pointless warning.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V4 1/2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-08 Thread [email protected]
On Fri, Mar 8, 2019 at 5:26 PM Paul B Mahol  wrote:
>
> On 3/8/19, Jun Zhao  wrote:
> > From: Jun Zhao 
> >
> > accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
> > data type, it will have a risk of an integral value becoming larger than
> > the 32-bits integer capacity and resulting in an integer overflow. For
> > this risk, add a checking with warning message.
> >
> > Signed-off-by: Jun Zhao 
> > ---
> >  libavfilter/vf_nlmeans.c |   11 +++
> >  1 files changed, 11 insertions(+), 0 deletions(-)
> >
> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
> > index dcb5a03..8d47f9d 100644
> > --- a/libavfilter/vf_nlmeans.c
> > +++ b/libavfilter/vf_nlmeans.c
> > @@ -477,6 +477,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> > *in)
> >  NLMeansContext *s = ctx->priv;
> >  AVFilterLink *outlink = ctx->outputs[0];
> >
> > +// accumulation of 8-bits uint_8 into 32-bits data type, it will have
> > +// a risk of an integral value becoming larger than the 32-bits integer
> > +// capacity and resulting in an integer overflow, so limit the image
> > size
> > +if ((UINT32_MAX / (uint64_t)inlink->w) < (255 * (uint64_t)inlink->h)) {
> > +av_log(ctx, AV_LOG_ERROR,
> > +   "image size (%d x %d) integral value may overflow.\n",
> > +   inlink->w, inlink->h);
> > +av_frame_free(&in);
> > +return AVERROR(EINVAL);
> > +}
> > +
> >  AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> >  if (!out) {
> >  av_frame_free(&in);
>
> I see no point in this warning, if overflow is real issue should be
> fixed instead of giving
> pointless warning.
In fact, this is a potential overflow problem that depends on the image
values/width/height when calculating the integral image (also known as a
summed-area table, https://en.wikipedia.org/wiki/Summed-area_table).
That is why this patch limits the image size: to avoid this potential
overflow. I don't understand what you mean by " should be fixed instead
of giving pointless warning."; can you give more information about this?
Thanks.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH V4 1/2] lavfi/nlmeans: Checking number precision when computing integral images

2019-03-08 Thread Paul B Mahol
On 3/8/19, [email protected]  wrote:
> On Fri, Mar 8, 2019 at 5:26 PM Paul B Mahol  wrote:
>>
>> On 3/8/19, Jun Zhao  wrote:
>> > From: Jun Zhao 
>> >
>> > accumulation of 8-bits uint_8 (uint8_t *src) into 32-bits (uint32_t *ii)
>> > data type, it will have a risk of an integral value becoming larger than
>> > the 32-bits integer capacity and resulting in an integer overflow. For
>> > this risk, add a checking with warning message.
>> >
>> > Signed-off-by: Jun Zhao 
>> > ---
>> >  libavfilter/vf_nlmeans.c |   11 +++
>> >  1 files changed, 11 insertions(+), 0 deletions(-)
>> >
>> > diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
>> > index dcb5a03..8d47f9d 100644
>> > --- a/libavfilter/vf_nlmeans.c
>> > +++ b/libavfilter/vf_nlmeans.c
>> > @@ -477,6 +477,17 @@ static int filter_frame(AVFilterLink *inlink,
>> > AVFrame
>> > *in)
>> >  NLMeansContext *s = ctx->priv;
>> >  AVFilterLink *outlink = ctx->outputs[0];
>> >
>> > +// accumulation of 8-bits uint_8 into 32-bits data type, it will
>> > have
>> > +// a risk of an integral value becoming larger than the 32-bits
>> > integer
>> > +// capacity and resulting in an integer overflow, so limit the
>> > image
>> > size
>> > +if ((UINT32_MAX / (uint64_t)inlink->w) < (255 *
>> > (uint64_t)inlink->h)) {
>> > +av_log(ctx, AV_LOG_ERROR,
>> > +   "image size (%d x %d) integral value may overflow.\n",
>> > +   inlink->w, inlink->h);
>> > +av_frame_free(&in);
>> > +return AVERROR(EINVAL);
>> > +}
>> > +
>> >  AVFrame *out = ff_get_video_buffer(outlink, outlink->w,
>> > outlink->h);
>> >  if (!out) {
>> >  av_frame_free(&in);
>>
>> I see no point in this warning, if overflow is real issue should be
>> fixed instead of giving
>> pointless warning.
> In fact, this is a potential overflow problems depend on image
> value/width/height when calculating integral image(Summed-area_table
> is the other name https://en.wikipedia.org/wiki/Summed-area_table),
> this is the reason to limit the image size in this patch to avoid this
> potential overflow problems, I don't know what's the mean for " should
> be fixed instead of giving pointless warning.", can you give more
> information for this? thx.

Use uint64_t type.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-07-27 Thread Dylan Fernando

From 2ddd2f5e0d9559bbbf46de3d7cfb7ffbbdbefceb Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Tue, 27 Jul 2021 19:25:59 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

---
 compat/cuda/cuda_runtime.h |   1 +
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/vf_nlmeans_cuda.c  | 814 +
 libavfilter/vf_nlmeans_cuda.cu | 361 +++
 7 files changed, 1185 insertions(+)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h
index c5450b2542..c1e2143dde 100644
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -184,5 +184,6 @@ static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
 static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
+static inline __device__ float exp(float a) { return __nvvm_ex2_approx_f(a); }
 
 #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
diff --git a/configure b/configure
index 646d16e3c9..96a6fcde7d 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 66c0f87e47..a0b68fc49f 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15228,6 +15228,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter using CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 49c0c8342b..565923d85a 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -341,6 +341,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index ae74f9c891..5fcdfecfbc 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -327,6 +327,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 00..fd7e649556
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,814 @@
+
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+AV_PIX_FMT_YUV420P,
+AV_PIX_FMT_NV12,
+AV_PIX_FMT_YUV444P,
+AV_PIX_FMT_P010,
+AV_PIX_FMT_P016,
+AV_PIX_FMT_YUV444P16,
+AV_PIX_FMT_0RGB32,
+AV_PIX_FMT_0BGR32,
+};
+
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+
+
+typedef struct NLMeansCudaContext {
+const AVClass *class;
+
+doublesigma;
+int   patch_size;
+int   patch_size_uv;
+int   research_size;
+int   research_size_uv;
+int   initialised;
+
+float h;
+
+AVBufferRef *hw_frames_ctx;
+AVCUDADeviceContext *hwctx;
+
+CUmodulecu_module;
+
+CUfunction  cu_func_horiz_uchar;
+CUfunction  cu_func_horiz_uchar2;

Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-07-27 Thread Timo Rothenpieler
I'm super loaded with work this week already, so I won't have a chance 
to look at this before some time next week.


First glance looks fine though and I'll come back to you with a proper 
review next week!




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-13 Thread Dylan Fernando
Any update on this?

Kind Regards,
Dylan
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-13 Thread Timo Rothenpieler

On 13.08.2021 10:42, Dylan Fernando wrote:

Any update on this?


Missing license header in both new files (please re-send for this).
Missing version bump, though I can add this when pushing as well.

The code looks fine to me, though I have no experience with this 
algorithm at all, so if someone who does could give it a look, that'd be 
greatly appreciated.




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-13 Thread Timo Rothenpieler

On 13.08.2021 10:42, Dylan Fernando wrote:

Any update on this?

Kind Regards,
Dylan


Also, are you sure that exp() function is correct?

The CUDA-Function exp() is defined as "double exp(double x)" and 
calculates the base e exponential.


While __nvvm_ex2_approx_f reads to me like it does so for floats, and 
for base 2. For which the CUDA equivalent would be "float exp2f(float)".




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-13 Thread Dylan Fernando
On Sat, Aug 14, 2021 at 9:11 AM Timo Rothenpieler 
wrote:

> On 13.08.2021 10:42, Dylan Fernando wrote:
> > Any update on this?
> >
> > Kind Regards,
> > Dylan
>
> Also, are you sure that exp() function is correct?
>
> The CUDA-Function exp() is defined as "double exp(double x)" and
> calculates the base e exponential.
>
> While __nvvm_ex2_approx_f reads to me like it does so for floats, and
> for base 2. For which the CUDA equivalent would be "float exp2f(float)".
>
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>

I wasn't sure about the exp() function. Is there a function like
__nvvm_exp_approx_d? I can't seem to find a function for this.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-14 Thread Timo Rothenpieler

On 14.08.2021 07:49, Dylan Fernando wrote:

On Sat, Aug 14, 2021 at 9:11 AM Timo Rothenpieler 
wrote:


On 13.08.2021 10:42, Dylan Fernando wrote:

Any update on this?

Kind Regards,
Dylan


Also, are you sure that exp() function is correct?

The CUDA-Function exp() is defined as "double exp(double x)" and
calculates the base e exponential.

While __nvvm_ex2_approx_f reads to me like it does so for floats, and
for base 2. For which the CUDA equivalent would be "float exp2f(float)".

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".



I wasn't sure about the exp() function. Is there a function like
__nvvm_exp_approx_d? I can't seem to find a function for this.


Which specific exp function do you actually require? There's a bunch of 
different ones, depending on precision and the base.




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-14 Thread Timo Rothenpieler

On 14.08.2021 07:49, Dylan Fernando wrote:

On Sat, Aug 14, 2021 at 9:11 AM Timo Rothenpieler 
wrote:


On 13.08.2021 10:42, Dylan Fernando wrote:

Any update on this?

Kind Regards,
Dylan


Also, are you sure that exp() function is correct?

The CUDA-Function exp() is defined as "double exp(double x)" and
calculates the base e exponential.

While __nvvm_ex2_approx_f reads to me like it does so for floats, and
for base 2. For which the CUDA equivalent would be "float exp2f(float)".

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".



I wasn't sure about the exp() function. Is there a function like
__nvvm_exp_approx_d? I can't seem to find a function for this.


Looking into it some more, that's simply because there is no other fast 
approx exp function than ex2.

If I use __expf() with nvcc, it spawns the following code:

ld.param.f32%f1, [param];
mul.f32 %f2, %f1, 0f3FB8AA3B;
ex2.approx.f32  %f3, %f2;

So it multiplies the input value by a constant factor, and then runs the
product through ex2.approx.
Mathematically, this factor must be log2(e), i.e. log2(exp(1)) for lack
of the constant e being defined (0f3FB8AA3B is 1.442695 as a float).


So the implementation of __expf() would look like this:


static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * 
(float)__builtin_log2(__builtin_exp(1))); }


With llvm, this now spawns the exact same code:

ld.param.f32%f1, [param];
mul.f32 %f2, %f1, 0f3FB8AA3B;
ex2.approx.f32  %f3, %f2;


I will push that function soon, so you can just use __expf() in your 
code. Assuming you want exp to base e.




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-15 Thread Dylan Fernando
On Sat, Aug 14, 2021 at 1:03 PM Timo Rothenpieler 
wrote:

> On 14.08.2021 07:49, Dylan Fernando wrote:
> > On Sat, Aug 14, 2021 at 9:11 AM Timo Rothenpieler  >
> > wrote:
> >
> >> On 13.08.2021 10:42, Dylan Fernando wrote:
> >>> Any update on this?
> >>>
> >>> Kind Regards,
> >>> Dylan
> >>
> >> Also, are you sure that exp() function is correct?
> >>
> >> The CUDA-Function exp() is defined as "double exp(double x)" and
> >> calculates the base e exponential.
> >>
> >> While __nvvm_ex2_approx_f reads to me like it does so for floats, and
> >> for base 2. For which the CUDA equivalent would be "float exp2f(float)".
> >>
> >> ___
> >> ffmpeg-devel mailing list
> >> [email protected]
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> [email protected] with subject "unsubscribe".
> >>
> >
> > I wasn't sure about the exp() function. Is there a function like
> > __nvvm_exp_approx_d? I can't seem to find a function for this.
>
> Looking into it some more, that's simply because there is no other fast
> approx exp function than ex2.
> If I use __expf() with nvcc, it spawns the following code:
>
> ld.param.f32%f1, [param];
> mul.f32 %f2, %f1, 0f3FB8AA3B;
> ex2.approx.f32  %f3, %f2;
>
> So it multiplies the input value by some factor, and then runs it
> through it.
> Given by math, this value must be log2(euler_constant), or log2(exp(1)),
> for lack of the constant being defined.
>
> So the implementation of __expf() would look like this:
>
> > static inline __device__ float __expf(float a) { return
> __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); }
>
> With llvm, this now spawns the exact same code:
>
> ld.param.f32%f1, [param];
> mul.f32 %f2, %f1, 0f3FB8AA3B;
> ex2.approx.f32  %f3, %f2;
>
>
> I will push that function soon, so you can just use __expf() in your
> code. Assuming you want exp to base e.
>
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>
> Attached updated patch
>
From 1b1103f48599f56a84f749c51085c22341a95fa3 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Sun, 15 Aug 2021 17:08:07 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   2 +-
 libavfilter/vf_nlmeans_cuda.c  | 830 +
 libavfilter/vf_nlmeans_cuda.cu | 378 +++
 7 files changed, 1218 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index 82639ce057..0d905cc3c2 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index bdeb3fedfd..585aff9880 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15234,6 +15234,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter using CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 49c0c8342b..565923d85a 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -341,6 +341,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+

Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-22 Thread Timo Rothenpieler
If nobody wants to review the algorithm being implemented, I'm gonna 
apply this soon.
It looks fine by all I can tell, but I never touched the software 
version of this filter.


smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter

2021-08-27 Thread Timo Rothenpieler

There still are some issues in the C side of this filter:

You're allocating a lot of stuff on init (integral_img, ...) but never 
free it. So the filter leaks those.


You define a list of supported formats at the top, listing practically 
all formats that can be in a CUDA frame as of right now.
But then later in filter_frame you have a switch-case that returns
AVERROR_BUG if it's anything else than NV12.
So, either add support for all the other formats or don't claim support 
for more than NV12.


I'm also not sure if not taking an internal reference to the 
hw_frames_ctx is valid.

It might be, but I'm not sure about the lifetime of the one on the output.
Typically, filters also hold an internal reference and unref it on uninit.


Generally:

There's a bunch of trailing whitespaces everywhere, please tell your 
editor to not do that.


Also, when bumping minor version, the patch version drops back to 100.


smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-02 Thread Dylan Fernando
I want to add support for the other formats, but I'm not sure how to find
video files to test it out. I tried looking through
https://samples.ffmpeg.org/, but I'm not sure which files on there are in the
formats I'm looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).
From 9183993c6b31560ed21fe9dd4c06f7e01735d903 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Thu, 2 Sep 2021 17:31:49 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando 
---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   2 +-
 libavfilter/vf_nlmeans_cuda.c  | 850 +
 libavfilter/vf_nlmeans_cuda.cu | 378 +++
 7 files changed, 1238 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index 9249254b70..55ed0200c7 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 9ad6031d23..b5eb9ecd33 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15380,6 +15380,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter using CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index af957a5ac0..7a61d7591e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -347,6 +347,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 0c6b2347c8..d65c13011c 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -333,6 +333,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index ff12ff9f8f..306bb62ff4 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,7 +30,7 @@
 #include "libavutil/version.h"
 
 #define LIBAVFILTER_VERSION_MAJOR   8
-#define LIBAVFILTER_VERSION_MINOR   7
+#define LIBAVFILTER_VERSION_MINOR   8
 #define LIBAVFILTER_VERSION_MICRO 100
 
 
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 00..3dc74e310d
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,850 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/l

Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-02 Thread Timo Rothenpieler

On 02.09.2021 19:50, Dylan Fernando wrote:

I want to add support for the other formats, but I'm not sure how to find
video files to test it out. I tried looking through
https://samples.ffmpeg.org/, but I'm not sure which files on there are the
formats im looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).


Just slap a format_cuda filter in front and convert to the desired format.
For RGB formats, which it doesn't support right now, just use 
format,hwupload_cuda.




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-02 Thread Timo Rothenpieler

On 02.09.2021 15:32, Timo Rothenpieler wrote:

On 02.09.2021 19:50, Dylan Fernando wrote:

I want to add support for the other formats, but I'm not sure how to find
video files to test it out. I tried looking through
https://samples.ffmpeg.org/, but I'm not sure which files on there are 
the

formats im looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).


Just slap a format_cuda filter in front and convert to the desired format.
For RGB formats, which it doesn't support right now, just use 
format,hwupload_cuda.


sorry, scale_cuda or format+hwupload


smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-04 Thread Dylan Fernando
On Thu, Sep 2, 2021 at 4:25 PM Timo Rothenpieler 
wrote:

> On 02.09.2021 15:32, Timo Rothenpieler wrote:
> > On 02.09.2021 19:50, Dylan Fernando wrote:
> >> I want to add support for the other formats, but I'm not sure how to
> find
> >> video files to test it out. I tried looking through
> >> https://samples.ffmpeg.org/, but I'm not sure which files on there are
> >> the
> >> formats im looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
> >> AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).
> >
> > Just slap a format_cuda filter in front and convert to the desired
> format.
> > For RGB formats, which it doesn't support right now, just use
> > format,hwupload_cuda.
>
> sorry, scale_cuda or format+hwupload
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>

Thanks, I have yuv420p and yuv444p working now.

For P010, I tried using:
ffmpeg -loglevel debug -v verbose -hwaccel cuda -hwaccel_output_format cuda
-i noise.mp4 -vf format=p010,hwupload,nlmeans_cuda=20,hwdownload nlmeans.mp4

and I get:
[Parsed_format_0 @ 0x558bf0ff6bc0] auto-inserting filter 'auto_scale_0'
between the filter 'graph 0 input from stream 0:0' and the filter
'Parsed_format_0'
Impossible to convert between the formats supported by the filter 'graph 0
input from stream 0:0' and the filter 'auto_scale_0'

Segmentation fault (core dumped)
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-04 Thread Timo Rothenpieler

On 04.09.2021 22:03, Dylan Fernando wrote:

On Thu, Sep 2, 2021 at 4:25 PM Timo Rothenpieler 
wrote:


On 02.09.2021 15:32, Timo Rothenpieler wrote:

On 02.09.2021 19:50, Dylan Fernando wrote:

I want to add support for the other formats, but I'm not sure how to

find

video files to test it out. I tried looking through
https://samples.ffmpeg.org/, but I'm not sure which files on there are
the
formats im looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).


Just slap a format_cuda filter in front and convert to the desired

format.

For RGB formats, which it doesn't support right now, just use
format,hwupload_cuda.


sorry, scale_cuda or format+hwupload
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".



Thanks, I have yuv420p and yuv444p working now.

For P010, I tried using:
ffmpeg -loglevel debug -v verbose -hwaccel cuda -hwaccel_output_format cuda
-i noise.mp4 -vf format=p010,hwupload,nlmeans_cuda=20,hwdownload nlmeans.mp4

and I get:
[Parsed_format_0 @ 0x558bf0ff6bc0] auto-inserting filter 'auto_scale_0'
between the filter 'graph 0 input from stream 0:0' and the filter
'Parsed_format_0'
Impossible to convert between the formats supported by the filter 'graph 0
input from stream 0:0' and the filter 'auto_scale_0'

Segmentation fault (core dumped)


you're trying to hwupload something that already is uploaded by the decoder.
Either use scale_cuda for the conversion, or don't have the decoder 
output CUDA frames.


The segfault is a bit unexpected though. Can you get a backtrace?


smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-05 Thread Dylan Fernando
On Sat, Sep 4, 2021 at 10:43 AM Timo Rothenpieler 
wrote:

> On 04.09.2021 22:03, Dylan Fernando wrote:
> > On Thu, Sep 2, 2021 at 4:25 PM Timo Rothenpieler 
> > wrote:
> >
> >> On 02.09.2021 15:32, Timo Rothenpieler wrote:
> >>> On 02.09.2021 19:50, Dylan Fernando wrote:
> >>>> I want to add support for the other formats, but I'm not sure how to
> >> find
> >>>> video files to test it out. I tried looking through
> >>>> https://samples.ffmpeg.org/, but I'm not sure which files on there
> are
> >>>> the
> >>>> formats im looking for (AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
> >>>> AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16).
> >>>
> >>> Just slap a format_cuda filter in front and convert to the desired
> >> format.
> >>> For RGB formats, which it doesn't support right now, just use
> >>> format,hwupload_cuda.
> >>
> >> sorry, scale_cuda or format+hwupload
> >> ___
> >> ffmpeg-devel mailing list
> >> [email protected]
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> [email protected] with subject "unsubscribe".
> >>
> >
> > Thanks, I have yuv420p and yuv444p working now.
> >
> > For P010, I tried using:
> > ffmpeg -loglevel debug -v verbose -hwaccel cuda -hwaccel_output_format
> cuda
> > -i noise.mp4 -vf format=p010,hwupload,nlmeans_cuda=20,hwdownload
> nlmeans.mp4
> >
> > and I get:
> > [Parsed_format_0 @ 0x558bf0ff6bc0] auto-inserting filter 'auto_scale_0'
> > between the filter 'graph 0 input from stream 0:0' and the filter
> > 'Parsed_format_0'
> > Impossible to convert between the formats supported by the filter 'graph
> 0
> > input from stream 0:0' and the filter 'auto_scale_0'
> >
> > Segmentation fault (core dumped)
>
> you're trying to hwupload something that already is uploaded by the
> decoder.
> Either use scale_cuda for the conversion, or don't have the decoder
> output CUDA frames.
>
> The segfault is a bit unexpected though. Can you get a backtrace?
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>


Impossible to convert between the formats supported by the filter 'graph 0
input from stream 0:0' and the filter 'auto_scale_0'

Thread 1 "ffmpeg" received signal SIGSEGV, Segmentation fault.
0x556726eb in uninit (ctx=0x5817e800) at
libavfilter/vf_nlmeans_cuda.c:704
704 CudaFunctions *cu = s->hwctx->internal->cuda_dl;
(gdb) backtrace
#0  0x556726eb in uninit (ctx=0x5817e800) at
libavfilter/vf_nlmeans_cuda.c:704
#1  0x55742e65 in avfilter_free (filter=0x5817e800) at
libavfilter/avfilter.c:769
#2  0x55744cac in avfilter_graph_free (graph=0x5716ded0) at
libavfilter/avfiltergraph.c:126
#3  0x55707ae0 in cleanup_filtergraph (fg=0x5716dec0) at
fftools/ffmpeg_filter.c:952
#4  configure_filtergraph (fg=fg@entry=0x5716dec0) at
fftools/ffmpeg_filter.c:1130
#5  0x5571b060 in ifilter_send_frame (frame=0x57a72d00,
ifilter=0x5716db40) at fftools/ffmpeg.c:2242
#6  send_frame_to_filters (ist=ist@entry=0x5716c5c0,
decoded_frame=decoded_frame@entry=0x57a72d00) at fftools/ffmpeg.c:2323
#7  0x5571c204 in decode_video (decode_failed=<optimized out>,
eof=<optimized out>, duration_pts=<optimized out>, got_output=<optimized out>, pkt=<optimized out>, ist=<optimized out>)
at fftools/ffmpeg.c:2520
#8  process_input_packet (ist=ist@entry=0x5716c5c0, pkt=0x5716c7c0,
no_eof=no_eof@entry=0) at fftools/ffmpeg.c:2682
#9  0x5571daee in process_input (file_index=<optimized out>) at
fftools/ffmpeg.c:4636
#10 transcode_step () at fftools/ffmpeg.c:4776
#11 transcode () at fftools/ffmpeg.c:4830
#12 0x556f84a7 in main (argc=<optimized out>, argv=<optimized out>)
at fftools/ffmpeg.c:5035

>
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-09-05 Thread Timo Rothenpieler

On 05.09.2021 20:02, Dylan Fernando wrote:


Impossible to convert between the formats supported by the filter 'graph 0
input from stream 0:0' and the filter 'auto_scale_0'

Thread 1 "ffmpeg" received signal SIGSEGV, Segmentation fault.
0x556726eb in uninit (ctx=0x5817e800) at
libavfilter/vf_nlmeans_cuda.c:704
704 CudaFunctions *cu = s->hwctx->internal->cuda_dl;


Seems like some error state can cause uninit to be called before the 
hwctx is properly set up.

So it needs a guard against that case.


(gdb) backtrace
#0  0x556726eb in uninit (ctx=0x5817e800) at
libavfilter/vf_nlmeans_cuda.c:704
#1  0x55742e65 in avfilter_free (filter=0x5817e800) at
libavfilter/avfilter.c:769
#2  0x55744cac in avfilter_graph_free (graph=0x5716ded0) at
libavfilter/avfiltergraph.c:126
#3  0x55707ae0 in cleanup_filtergraph (fg=0x5716dec0) at
fftools/ffmpeg_filter.c:952
#4  configure_filtergraph (fg=fg@entry=0x5716dec0) at
fftools/ffmpeg_filter.c:1130
#5  0x5571b060 in ifilter_send_frame (frame=0x57a72d00,
ifilter=0x5716db40) at fftools/ffmpeg.c:2242
#6  send_frame_to_filters (ist=ist@entry=0x5716c5c0,
decoded_frame=decoded_frame@entry=0x57a72d00) at fftools/ffmpeg.c:2323
#7  0x5571c204 in decode_video (decode_failed=<optimized out>,
eof=<optimized out>, duration_pts=<optimized out>, got_output=<optimized out>, pkt=<optimized out>, ist=<optimized out>)
 at fftools/ffmpeg.c:2520
#8  process_input_packet (ist=ist@entry=0x5716c5c0, pkt=0x5716c7c0,
no_eof=no_eof@entry=0) at fftools/ffmpeg.c:2682
#9  0x5571daee in process_input (file_index=<optimized out>) at
fftools/ffmpeg.c:4636
#10 transcode_step () at fftools/ffmpeg.c:4776
#11 transcode () at fftools/ffmpeg.c:4830
#12 0x556f84a7 in main (argc=<optimized out>, argv=<optimized out>)
at fftools/ffmpeg.c:5035




smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3] lavfi: add nlmeans CUDA filter

2021-09-08 Thread Dylan Fernando

From f8b8a250a2c4b092747d632adc7dafccfc474140 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Wed, 8 Sep 2021 18:19:40 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando 
---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   4 +-
 libavfilter/vf_nlmeans_cuda.c  | 883 +
 libavfilter/vf_nlmeans_cuda.cu | 378 ++
 7 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index af410a9d11..7fa67e415e 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 9ad6031d23..b5eb9ecd33 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15380,6 +15380,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index af957a5ac0..7a61d7591e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -347,6 +347,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 0c6b2347c8..d65c13011c 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -333,6 +333,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 2110048b77..306bb62ff4 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,8 +30,8 @@
 #include "libavutil/version.h"
 
 #define LIBAVFILTER_VERSION_MAJOR   8
-#define LIBAVFILTER_VERSION_MINOR   7
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR   8
+#define LIBAVFILTER_VERSION_MICRO 100
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 00..3ecc7c8945
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,883 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+AV_PIX_FMT_NV12,
+AV_PIX_FMT_YUV420P,
+AV_PIX_FMT_YUV444P
+};
+
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->in

Re: [FFmpeg-devel] [PATCH v3] lavfi: add nlmeans CUDA filter

2021-09-22 Thread Dylan Fernando
Any feedback for this?
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3] lavfi: add nlmeans CUDA filter

2021-11-05 Thread Dylan Fernando
nlmeans CUDA filter
From f4c332c308865a33d42c8a2dfe251625506a30f0 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Sat, 6 Nov 2021 10:30:15 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando 
---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   4 +-
 libavfilter/vf_nlmeans_cuda.c  | 883 +
 libavfilter/vf_nlmeans_cuda.cu | 378 ++
 7 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index c01aa480c7..ac756ef630 100755
--- a/configure
+++ b/configure
@@ -3103,6 +3103,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 amf_deps_any="libdl LoadLibrary"
diff --git a/doc/filters.texi b/doc/filters.texi
index b537e421be..52e6208710 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15668,6 +15668,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 552bd4e286..9e8f42c176 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -354,6 +354,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_negate.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 667b6fc246..8f812dcd87 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -340,6 +340,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 3bd3816698..cb831b4a1c 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,8 +30,8 @@
 #include "libavutil/version.h"
 
 #define LIBAVFILTER_VERSION_MAJOR   8
-#define LIBAVFILTER_VERSION_MINOR  16
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR  17
+#define LIBAVFILTER_VERSION_MICRO 100
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 00..1c838a8af1
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,883 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+AV_PIX_FMT_NV12,
+AV_PIX_FMT_YUV420P,
+AV_PIX_FMT_YUV444P
+};
+
+
+#define CHECK

Re: [FFmpeg-devel] [PATCH v3] lavfi: add nlmeans CUDA filter

2021-11-05 Thread Dylan Fernando
On Sat, Nov 6, 2021 at 10:34 AM Dylan Fernando  wrote:

> nlmeans CUDA filter
>
>
>
> removed query_formats
>
From 45dcc1bdc00657bb3613500e131b6fdeb64ac318 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Sat, 6 Nov 2021 17:33:48 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando 
---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   4 +-
 libavfilter/vf_nlmeans_cuda.c  | 871 +
 libavfilter/vf_nlmeans_cuda.cu | 378 ++
 7 files changed, 1260 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index c01aa480c7..ac756ef630 100755
--- a/configure
+++ b/configure
@@ -3103,6 +3103,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 amf_deps_any="libdl LoadLibrary"
diff --git a/doc/filters.texi b/doc/filters.texi
index b537e421be..52e6208710 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15668,6 +15668,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 552bd4e286..9e8f42c176 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -354,6 +354,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_negate.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)   += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 667b6fc246..8f812dcd87 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -340,6 +340,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 3bd3816698..cb831b4a1c 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,8 +30,8 @@
 #include "libavutil/version.h"
 
 #define LIBAVFILTER_VERSION_MAJOR   8
-#define LIBAVFILTER_VERSION_MINOR  16
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR  17
+#define LIBAVFILTER_VERSION_MICRO 100
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 00..cece797e15
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,871 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[]

Re: [FFmpeg-devel] [PATCH v3] lavfi: add nlmeans CUDA filter

2021-11-16 Thread Dylan Fernando
On Sun, 7 Nov 2021 at 4:38 am, Dylan Fernando  wrote:

>
>
> On Sat, Nov 6, 2021 at 10:34 AM Dylan Fernando 
> wrote:
>
>> nlmeans CUDA filter
>>
>>
>>
>> removed query_formats
>>
> Anybody have any feedback for this?
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-11-27 Thread Dylan Fernando
On Sun, Sep 5, 2021 at 8:37 AM Timo Rothenpieler 
wrote:

> On 05.09.2021 20:02, Dylan Fernando wrote:
> >
> > Impossible to convert between the formats supported by the filter 'graph
> 0
> > input from stream 0:0' and the filter 'auto_scale_0'
> >
> > Thread 1 "ffmpeg" received signal SIGSEGV, Segmentation fault.
> > 0x556726eb in uninit (ctx=0x5817e800) at
> > libavfilter/vf_nlmeans_cuda.c:704
> > 704 CudaFunctions *cu = s->hwctx->internal->cuda_dl;
>
> Seems like some error state can cause uninit to be called before the
> hwctx is properly set up.
> So it needs a guard against that case.
>
> > (gdb) backtrace
> > #0  0x556726eb in uninit (ctx=0x5817e800) at
> > libavfilter/vf_nlmeans_cuda.c:704
> > #1  0x55742e65 in avfilter_free (filter=0x5817e800) at
> > libavfilter/avfilter.c:769
> > #2  0x55744cac in avfilter_graph_free (graph=0x5716ded0) at
> > libavfilter/avfiltergraph.c:126
> > #3  0x55707ae0 in cleanup_filtergraph (fg=0x5716dec0) at
> > fftools/ffmpeg_filter.c:952
> > #4  configure_filtergraph (fg=fg@entry=0x5716dec0) at
> > fftools/ffmpeg_filter.c:1130
> > #5  0x5571b060 in ifilter_send_frame (frame=0x57a72d00,
> > ifilter=0x5716db40) at fftools/ffmpeg.c:2242
> > #6  send_frame_to_filters (ist=ist@entry=0x5716c5c0,
> > decoded_frame=decoded_frame@entry=0x57a72d00) at
> fftools/ffmpeg.c:2323
> > #7  0x5571c204 in decode_video (decode_failed=<optimized out>,
> > eof=<optimized out>, duration_pts=<optimized out>, got_output=<optimized out>, pkt=<optimized out>, ist=<optimized out>)
> >  at fftools/ffmpeg.c:2520
> > #8  process_input_packet (ist=ist@entry=0x5716c5c0,
> pkt=0x5716c7c0,
> > no_eof=no_eof@entry=0) at fftools/ffmpeg.c:2682
> > #9  0x5571daee in process_input (file_index=<optimized out>) at
> > fftools/ffmpeg.c:4636
> > #10 transcode_step () at fftools/ffmpeg.c:4776
> > #11 transcode () at fftools/ffmpeg.c:4830
> > #12 0x556f84a7 in main (argc=<optimized out>, argv=<optimized out>)
> > at fftools/ffmpeg.c:5035
>
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>
>
> attached updated patch
>
From 32239fc2225d4ff5cd8d7d3d20764d34b26e3a92 Mon Sep 17 00:00:00 2001
From: Dylan Fernando 
Date: Sun, 28 Nov 2021 17:42:55 +
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando 
---
 configure  |   2 +
 doc/filters.texi   |   4 +
 libavfilter/Makefile   |   2 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/version.h  |   2 +-
 libavfilter/vf_nlmeans_cuda.c  | 871 +
 libavfilter/vf_nlmeans_cuda.cu | 378 ++
 7 files changed, 1259 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/configure b/configure
index 4af36bf80a..1ec0453f44 100755
--- a/configure
+++ b/configure
@@ -3115,6 +3115,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 amf_deps_any="libdl LoadLibrary"
diff --git a/doc/filters.texi b/doc/filters.texi
index 3731a14521..8f6c0ce22c 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15698,6 +15698,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA. This filter accepts the same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 0e27aeeff6..24d41567c5 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -357,6 +357,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER) += vf_negate.o
 OBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)   += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)  += vf_

Re: [FFmpeg-devel] [PATCH v2] lavfi: add nlmeans CUDA filter

2021-11-28 Thread Timo Rothenpieler

+for (i = 0; i < nb_pixel / 4; i++) {
+
+int *dx_cur = dxdy + 8 * i;
+int *dy_cur = dxdy + 8 * i + 4;
+
+call_horiz(ctx, 1, src_dptr, src_width, src_height, src_pitch,
+   integ_img, dx_cur, dy_cur, pixel_size);
+
+call_vert(ctx, 1, src_width, src_height, integ_img, pixel_size);
+
+call_weight(ctx, 1, src_dptr, src_width, src_height, src_pitch, integ_img, 
(float*)s->sum, (float*)s->weight, p, dx_cur, dy_cur, pixel_size);
+}
+
+call_average(ctx, 1, src_dptr, src_width, src_height, src_pitch, 
(float*)s->sum, (float*)s->weight,
+   dst_dptr, dst_width, dst_height, dst_pitch, pixel_size);


My immediate thought when seeing that block is "move this all to the 
CUDA side", but you're calling all those with different block layouts?


I don't understand the algorithm well enough, so I guess this is necessary.

How well does it perform? All those jumps between C and CUDA code come 
at an overhead.



Some other nits:
I'm not a fan of functions just called "init", "uninit" and so on. 
It's not wrong, given it's static, but it's usually nicer to give all 
functions a common prefix. "cunlmeans_" or something like that.


What's up with that if(!s->initialised) block in filter_frame? I would 
have thought it's logically impossible that it gets that far without 
init being called?




Otherwise, the filter looks fine to me.


smime.p7s
Description: S/MIME Cryptographic Signature
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH] tests/checkasm/nlmeans: Add check for av_calloc

2022-02-16 Thread Jiasheng Jiang
Since av_calloc() can fail, it is better to check its return value
and call fail() on failure, in order to avoid dereferencing a
NULL pointer.

Fixes: f679711c1b ("checkasm: add vf_nlmeans test for ssd_integral_image")
Signed-off-by: Jiasheng Jiang 
---
 tests/checkasm/vf_nlmeans.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/vf_nlmeans.c b/tests/checkasm/vf_nlmeans.c
index 87474d6803..82370bbeec 100644
--- a/tests/checkasm/vf_nlmeans.c
+++ b/tests/checkasm/vf_nlmeans.c
@@ -47,9 +47,9 @@ void checkasm_check_nlmeans(void)
 const int ii_h = h + e*2;
 const int ii_lz_32 = FFALIGN(ii_w + 1, 4);
 uint32_t *ii_orig_ref = av_calloc(ii_h + 1, ii_lz_32 * 
sizeof(*ii_orig_ref));
-uint32_t *ii_ref = ii_orig_ref + ii_lz_32 + 1;
+uint32_t *ii_ref;
 uint32_t *ii_orig_new = av_calloc(ii_h + 1, ii_lz_32 * 
sizeof(*ii_orig_new));
-uint32_t *ii_new = ii_orig_new + ii_lz_32 + 1;
+uint32_t *ii_new;
 const int src_lz = FFALIGN(w, 16);
 uint8_t *src = av_calloc(h, src_lz);
 
@@ -58,6 +58,16 @@ void checkasm_check_nlmeans(void)
  const uint8_t *s2, ptrdiff_t linesize2,
  int w, int h);
 
+if (!ii_orig_ref || !ii_orig_new || !src) {
+av_free(ii_orig_ref);
+av_free(ii_orig_new);
+av_free(src);
+fail();
+}
+
+ii_ref = ii_orig_ref + ii_lz_32 + 1;
+ii_new = ii_orig_new + ii_lz_32 + 1;
+
 randomize_buffer(src, h * src_lz);
 
 for (offy = -r; offy <= r; offy++) {
-- 
2.25.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".