Hi,

Patch attached.
From a41c741bb4d3146661e629552d750638a0bdc87c Mon Sep 17 00:00:00 2001
From: Paul B Mahol <one...@gmail.com>
Date: Sat, 23 Jan 2016 17:15:53 +0100
Subject: [PATCH] avfilter: add nnedi filter

Port of the nnedi3 VapourSynth filter.

Signed-off-by: Paul B Mahol <one...@gmail.com>
---
 configure                |   1 +
 libavfilter/Makefile     |   1 +
 libavfilter/allfilters.c |   1 +
 libavfilter/vf_nnedi.c   | 939 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 942 insertions(+)
 create mode 100644 libavfilter/vf_nnedi.c

diff --git a/configure b/configure
index 8f4642b..09de960 100755
--- a/configure
+++ b/configure
@@ -2880,6 +2880,7 @@ mpdecimate_filter_deps="gpl"
 mpdecimate_filter_select="pixelutils"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
+nnedi_filter_deps="gpl"
 perspective_filter_deps="gpl"
 pp7_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index b93e5f2..e76d18e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -187,6 +187,7 @@ OBJS-$(CONFIG_MCDEINT_FILTER)                += vf_mcdeint.o
 OBJS-$(CONFIG_MERGEPLANES_FILTER)            += vf_mergeplanes.o framesync.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
+OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
 OBJS-$(CONFIG_NULL_FILTER)                   += vf_null.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 1d48970..27d54bc 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -208,6 +208,7 @@ void avfilter_register_all(void)
     REGISTER_FILTER(MERGEPLANES,    mergeplanes,    vf);
     REGISTER_FILTER(MPDECIMATE,     mpdecimate,     vf);
     REGISTER_FILTER(NEGATE,         negate,         vf);
+    REGISTER_FILTER(NNEDI,          nnedi,          vf);
     REGISTER_FILTER(NOFORMAT,       noformat,       vf);
     REGISTER_FILTER(NOISE,          noise,          vf);
     REGISTER_FILTER(NULL,           null,           vf);
diff --git a/libavfilter/vf_nnedi.c b/libavfilter/vf_nnedi.c
new file mode 100644
index 0000000..582ffe0
--- /dev/null
+++ b/libavfilter/vf_nnedi.c
@@ -0,0 +1,939 @@
+/*
+ * Copyright (C) 2010-2011 Kevin Stone
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <float.h>
+
+#include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+/**
+ * Per-frame scratch state shared by copyPad(), evalFunc_0() and evalFunc_1().
+ * The per-plane arrays are indexed by plane number.
+ */
+typedef struct FrameData {
+    uint8_t *paddedp[3];    ///< padded copy of each processed source plane
+    int padded_stride[3];   ///< bytes between consecutive rows of paddedp[]
+    int padded_width[3];    ///< plane width + 64 (32 mirrored columns on each side)
+    int padded_height[3];   ///< plane height + 12 (6 mirrored rows above and below)
+
+    uint8_t *dstp[3];       ///< output plane pointers, taken from the output frame
+    int dst_stride[3];      ///< output plane linesizes
+
+    int field[3];           ///< field selector per plane (set to field_n in filter_frame())
+
+    int32_t *lcount[3];     ///< per-row counters accumulated by evalFunc_0()
+    float *input;           ///< scratch: pixel neighbourhood handed to the networks
+    float *temp;            ///< scratch: prescreener flags (evalFunc_0) / neuron outputs (evalFunc_1)
+} FrameData;
+
+/**
+ * Private context of the nnedi filter: user options, the weights selected
+ * from the weights file in init(), and the function pointers chosen by
+ * selectFunctions().
+ */
+typedef struct NNEDIContext {
+    const AVClass *class;
+
+    int fieldbased;         ///< when 1 or 2, filter_frame() forces the effective field to 0 or 1
+    char *weights_file;     ///< path of the weights file opened in init()
+
+    AVFloatDSPContext *fdsp; ///< provides scalarproduct_float() for dotProd_C()
+    int nb_planes;          ///< number of planes of the input pixel format
+    int linesize[4];        ///< per-plane line sizes from av_image_fill_linesizes(), used as plane widths
+    int planeheight[4];     ///< per-plane heights
+
+    float *weights0;        ///< prescreener weights (float or int16 layout, see init())
+    float *weights1[2];     ///< predictor weights, one set per quality pass
+    int asize;              ///< xdia * ydia
+    int nns;                ///< number of neurons, nnsTable[nnsparam]
+    int xdia;               ///< neighbourhood width, xdiaTable[nsize]
+    int ydia;               ///< neighbourhood height, ydiaTable[nsize]
+
+    // Parameters.
+    int field;              ///< values > 1 make filter_frame() alternate the field per frame
+    int dh; // double height
+    int process_plane;      ///< bitmask: bit N set means plane N is processed
+    int nsize;              ///< index into xdiaTable/ydiaTable in init()
+    int nnsparam;           ///< index into nnsTable in init()
+    int qual;               ///< number of weight sets evaluated and averaged in evalFunc_1()
+    int etype;              ///< selects which block of predictor weights is loaded in init()
+    int pscrn;              ///< 0: no prescreening, 1: original prescreener, >= 2: new prescreener
+    int fapprox;            ///< bit 0: int16 prescreener dot products, bit 1: int16 predictor dot products
+
+    int max_value;          ///< largest output sample value; set to 255 in init()
+
+    // Top-level stages, called in this order from filter_frame().
+    void (*copyPad)(const AVFrame *, FrameData *, struct NNEDIContext *, int);
+    void (*evalFunc_0)(struct NNEDIContext *, FrameData *);
+    void (*evalFunc_1)(struct NNEDIContext *, FrameData *);
+
+    // Functions used in evalFunc_0
+    void (*readPixels)(const uint8_t *, const int, float *);
+    void (*computeNetwork0)(struct NNEDIContext *s, const float *, const float *, uint8_t *);
+    int32_t (*processLine0)(const uint8_t *, int, uint8_t *, const uint8_t *, const int, const int, const int);
+
+    // Functions used in evalFunc_1
+    void (*extract)(const uint8_t *, const int, const int, const int, float *, float *);
+    void (*dotProd)(struct NNEDIContext *, const float *, const float *, float *, const int, const int, const float *);
+    void (*expfunc)(float *, const int);
+    void (*wae5)(const float *, const int, float *);
+} NNEDIContext;
+
+#define OFFSET(x) offsetof(NNEDIContext, x)
+/* Parenthesized so the macro stays a single operand wherever it is expanded. */
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
+
+/*
+ * User-visible options. "nsize" and "nns" are indices into the size tables
+ * in init(); "planes" is a bitmask tested per plane in the processing loops.
+ */
+static const AVOption nnedi_options[] = {
+    {"weights_file",  "set weights file",                                  OFFSET(weights_file),  AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
+    {"planes",        "set which planes to process (bitmask)",             OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 7, FLAGS },
+    {"nsize",         "set size index of the predictor neighbourhood",     OFFSET(nsize),         AV_OPT_TYPE_INT, {.i64=6}, 0, 6, FLAGS },
+    {"nns",           "set neuron count index of the predictor network",   OFFSET(nnsparam),      AV_OPT_TYPE_INT, {.i64=1}, 0, 4, FLAGS },
+    {"qual",          "set number of predictor weight sets to average",    OFFSET(qual),          AV_OPT_TYPE_INT, {.i64=1}, 1, 2, FLAGS },
+    {"etype",         "set which set of predictor weights to use",         OFFSET(etype),         AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
+    {"pscrn",         "set prescreener (0: none, 1: original, 2: new)",    OFFSET(pscrn),         AV_OPT_TYPE_INT, {.i64=2}, 0, 2, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nnedi);
+
+/**
+ * Input link configuration: cache the plane count, the per-plane line sizes
+ * and the per-plane heights of the negotiated format in the filter context.
+ *
+ * @return 0 on success, a negative error code if the line sizes cannot be
+ *         computed
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    NNEDIContext *s = inlink->dst->priv;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
+    int ret;
+
+    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
+
+    ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w);
+    if (ret < 0)
+        return ret;
+
+    // Planes 0 and 3 are full height; planes 1 and 2 are vertically subsampled.
+    s->planeheight[0] = inlink->h;
+    s->planeheight[3] = inlink->h;
+    s->planeheight[1] = FF_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+    s->planeheight[2] = s->planeheight[1];
+
+    return 0;
+}
+
+/**
+ * Advertise the pixel formats handled by the filter (8-bit planar YUV,
+ * planar GBR and 8-bit gray).
+ *
+ * @return the result of ff_set_common_formats(), or AVERROR(ENOMEM) if the
+ *         format list cannot be built
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat supported[] = {
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
+        AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
+        AV_PIX_FMT_YUVJ411P,
+        AV_PIX_FMT_GBRP,
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_NONE
+    };
+    AVFilterFormats *list = ff_make_format_list(supported);
+
+    return list ? ff_set_common_formats(ctx, list) : AVERROR(ENOMEM);
+}
+
+/**
+ * Copy the rows of one field of every processed plane of src into the padded
+ * buffers of frameData and mirror the borders of those rows.
+ *
+ * The image area starts at column 32, row 6 of the padded buffer. Only rows
+ * of parity (1 - fn) are written; rows of the other parity are left untouched.
+ *
+ * @param src       source frame
+ * @param frameData per-frame buffers; paddedp[] must already be allocated
+ * @param d         filter context (plane selection, plane sizes, dh flag)
+ * @param fn        field number; rows of the opposite parity are copied
+ */
+static void copyPad(const AVFrame *src, FrameData *frameData, NNEDIContext *d, int fn)
+{
+    // Parity of the rows that are copied.
+    const int off = 1 - fn;
+
+    for (int plane = 0; plane < d->nb_planes; ++plane) {
+        if (!(d->process_plane & (1 << plane)))
+            continue;
+
+        const uint8_t *srcp = (const uint8_t *)src->data[plane];
+        uint8_t *dstp = (uint8_t *)frameData->paddedp[plane];
+
+        const int src_stride = src->linesize[plane];
+        const int dst_stride = frameData->padded_stride[plane];
+
+        const int src_height = d->planeheight[plane];
+        const int dst_height = frameData->padded_height[plane];
+
+        const int src_width = d->linesize[plane];
+        const int dst_width = frameData->padded_width[plane];
+
+        // Copy.
+        if (!d->dh) {
+            // Same height: take every second source row, starting at row off.
+            for (int y = off; y < src_height; y += 2)
+                memcpy(dstp + 32 + (6 + y) * dst_stride,
+                       srcp + y * src_stride,
+                       src_width * sizeof(uint8_t));
+        } else {
+            // Double height: source row y lands on padded row 6 + 2 * y + off.
+            for (int y = 0; y < src_height; y++)
+                memcpy(dstp + 32 + (6 + y * 2 + off) * dst_stride,
+                       srcp + y * src_stride,
+                       src_width * sizeof(uint8_t));
+        }
+
+        // And pad.
+        // Left and right borders: mirror 32 columns on each copied row.
+        dstp += (6 + off) * dst_stride;
+        for (int y = 6 + off; y < dst_height - 6; y += 2) {
+            for (int x = 0; x < 32; ++x)
+                dstp[x] = dstp[64 - x];
+
+            // c grows by 2 per column so that each step reads one column
+            // further to the left of the right image edge.
+            int c = 2;
+            for (int x = dst_width - 32; x < dst_width; ++x, c += 2)
+                dstp[x] = dstp[x - c];
+
+            dstp += dst_stride * 2;
+        }
+
+        // Top border: fill rows of the same parity above row 6 by copying
+        // row (12 + 2 * off - y).
+        dstp = (uint8_t *)frameData->paddedp[plane];
+        for (int y = off; y < 6; y += 2)
+            memcpy(dstp + y * dst_stride,
+                   dstp + (12 + 2 * off - y) * dst_stride,
+                   dst_width * sizeof(uint8_t));
+
+        // Bottom border: fill the last rows of the same parity by copying
+        // rows 4, 8, 12 rows above them.
+        int c = 4;
+        for (int y = dst_height - 6 + off; y < dst_height; y += 2, c += 4)
+            memcpy(dstp + y * dst_stride,
+                   dstp + (y - c) * dst_stride,
+                   dst_width * sizeof(uint8_t));
+    }
+}
+
+/**
+ * Apply the Elliott activation x / (1 + |x|) in place to the first n
+ * elements of data.
+ */
+static void elliott_C(float *data, const int n)
+{
+    int idx = 0;
+
+    while (idx < n) {
+        const float v = data[idx];
+
+        data[idx] = v / (1.0f + FFABS(v));
+        idx++;
+    }
+}
+
+/**
+ * Evaluate n neurons with float weights.
+ *
+ * weights holds n rows of len coefficients followed by n biases. For every
+ * neuron i: vals[i] = dot(data, row_i) * scale[0] + bias_i.
+ */
+static void dotProd_C(NNEDIContext *s, const float *data, const float *weights, float *vals, const int n, const int len, const float *scale)
+{
+    const float *bias = weights + n * len;
+    const float *row  = weights;
+
+    for (int neuron = 0; neuron < n; neuron++, row += len) {
+        const float dot = s->fdsp->scalarproduct_float(data, row, len);
+
+        vals[neuron] = dot * scale[0] + bias[neuron];
+    }
+}
+
+/**
+ * Evaluate n neurons with int16 weights.
+ *
+ * dataf and weightsf are reinterpreted as int16_t arrays. The n * len int16
+ * coefficients are followed by floats stored in groups of eight: four
+ * per-neuron scale factors, then four per-neuron biases.
+ */
+static void dotProdS_C(NNEDIContext *s, const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *scale)
+{
+    const int16_t *data = (int16_t *)dataf;
+    const int16_t *weights = (int16_t *)weightsf;
+    const float *wf = (float *)&weights[n * len];
+
+    for (int neuron = 0; neuron < n; neuron++) {
+        const int16_t *row = weights + neuron * len;
+        // Position of this neuron's scale inside its group of eight floats.
+        const int off = ((neuron >> 2) << 3) + (neuron & 3);
+        int acc = 0;
+
+        for (int k = 0; k < len; k++)
+            acc += data[k] * row[k];
+
+        vals[neuron] = acc * wf[off] * scale[0] + wf[off + 4];
+    }
+}
+
+/**
+ * Original prescreener, float version.
+ *
+ * Runs the three small layers on the 48-sample input and stores 1 in d[0]
+ * when max(out[2], out[3]) <= max(out[0], out[1]), otherwise 0.
+ */
+static void computeNetwork0_C(NNEDIContext *s, const float *input, const float *weights, uint8_t *d)
+{
+    const float *layer1 = weights + 4 * 49;
+    const float *layer2 = layer1 + 4 * 5;
+    float temp[12];
+    float scale = 1.0f;
+    float first;
+
+    dotProd_C(s, input, weights, temp, 4, 48, &scale);
+    // The first output of layer 0 bypasses the activation.
+    first = temp[0];
+    elliott_C(temp, 4);
+    temp[0] = first;
+
+    dotProd_C(s, temp, layer1, temp + 4, 4, 4, &scale);
+    elliott_C(temp + 4, 4);
+
+    dotProd_C(s, temp, layer2, temp + 8, 4, 8, &scale);
+
+    d[0] = FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]) ? 1 : 0;
+}
+
+/**
+ * Original prescreener with an int16 first layer.
+ *
+ * The first layer uses dotProdS_C() on int16 data/weights; the two remaining
+ * layers use float weights located after the int16 block. d[0] receives 1
+ * when max(out[2], out[3]) <= max(out[0], out[1]), otherwise 0.
+ */
+static void computeNetwork0_i16_C(NNEDIContext *s, const float *inputf, const float *weightsf, uint8_t *d)
+{
+    // 4 * 48 int16 coefficients occupy 2 * 48 floats.
+    const float *wf = weightsf + 2 * 48;
+    const float *layer1 = wf + 8;
+    const float *layer2 = layer1 + 4 * 5;
+    float temp[12];
+    float scale = 1.0f;
+    float first;
+
+    dotProdS_C(s, inputf, weightsf, temp, 4, 48, &scale);
+    // The first output of layer 0 bypasses the activation.
+    first = temp[0];
+    elliott_C(temp, 4);
+    temp[0] = first;
+
+    dotProd_C(s, temp, layer1, temp + 4, 4, 4, &scale);
+    elliott_C(temp + 4, 4);
+
+    dotProd_C(s, temp, layer2, temp + 8, 4, 8, &scale);
+
+    d[0] = FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]) ? 1 : 0;
+}
+
+/**
+ * Read a 12x4 window (every second row, i.e. rows 2 * pitch apart) starting
+ * at t8 and store it as 48 consecutive floats in p.
+ */
+static void pixel2float48_C(const uint8_t *t8, const int pitch, float *p)
+{
+    for (int row = 0; row < 4; row++) {
+        const uint8_t *line = t8 + row * pitch * 2;
+
+        for (int col = 0; col < 12; col++)
+            *p++ = line[col];
+    }
+}
+
+/**
+ * Read a 12x4 window (every second row) starting at t and store it as 48
+ * consecutive int16_t values in the buffer pf points to.
+ */
+static void byte2word48_C(const uint8_t *t, const int pitch, float *pf)
+{
+    int16_t *out = (int16_t *)pf;
+
+    for (int row = 0; row < 4; row++) {
+        const uint8_t *line = t + row * pitch * 2;
+
+        for (int col = 0; col < 12; col++)
+            *out++ = line[col];
+    }
+}
+
+/**
+ * Fill one output row from the prescreener flags.
+ *
+ * Where tempu[x] is non-zero the pixel is cubic-interpolated from the four
+ * vertical neighbours (rows 0, 2, 4, 6 of src3p8) and clamped to
+ * [0, max_value - 1]; otherwise the pixel is set to the marker value 255.
+ *
+ * @return number of pixels set to the marker value
+ */
+static int32_t processLine0_C(const uint8_t *tempu, int width, uint8_t *dstp8, const uint8_t *src3p8, const int src_pitch, const int max_value, const int chroma)
+{
+    // Technically the -1 is only needed for 8 and 16 bit input.
+    const int hi = max_value - 1;
+    const int lo = 0;
+    int marked = 0;
+
+    for (int x = 0; x < width; x++) {
+        const uint8_t *col = src3p8 + x;
+        int v;
+
+        if (!tempu[x]) {
+            dstp8[x] = 255;
+            marked++;
+            continue;
+        }
+
+        v  = 19 * (col[src_pitch * 2] + col[src_pitch * 4]) - 3 * (col[0] + col[src_pitch * 6]);
+        v /= 32;
+        dstp8[x] = FFMAX(FFMIN(v, hi), lo);
+    }
+
+    return marked;
+}
+
+// new prescreener functions
+
+/**
+ * Read a 16x4 window (every second row) starting at t and store it as 64
+ * consecutive int16_t values in the buffer p points to.
+ */
+static void byte2word64_C(const uint8_t *t, const int pitch, float *p) {
+    int16_t *out = (int16_t *)p;
+
+    for (int row = 0; row < 4; row++) {
+        const uint8_t *line = t + row * pitch * 2;
+
+        for (int col = 0; col < 16; col++)
+            *out++ = line[col];
+    }
+}
+
+/**
+ * New prescreener: classify four horizontally adjacent pixels at once.
+ *
+ * @param s       filter context (unused here)
+ * @param datai   64 int16 samples written by byte2word64_C()
+ * @param weights 4 * 64 interleaved int16 first-layer weights followed by
+ *                floats: 4 scales, 4 biases, 16 second-layer weights and
+ *                4 second-layer biases
+ * @param d       receives four flags, d[0..3]; d[i] is 1 when output i of
+ *                the second layer is positive, 0 otherwise
+ */
+static void computeNetwork0new_C(NNEDIContext *s, const float *datai, const float *weights, uint8_t *d) {
+    int16_t *data = (int16_t *)datai;
+    int16_t *ws = (int16_t *)weights;
+    float *wf = (float *)&ws[4 * 64];
+    float vals[8];
+    for (int i = 0; i < 4; ++i) {
+        int sum = 0;
+        for (int j = 0; j < 64; ++j)
+            sum += data[j] * ws[(i << 3) + ((j >> 3) << 5) + (j & 7)];
+        const float t = sum * wf[i] + wf[4 + i];
+        vals[i] = t / (1.0f + FFABS(t));
+    }
+    for (int i = 0; i < 4; ++i) {
+        float sum = 0.0f;
+        for (int j = 0; j < 4; ++j)
+            sum += vals[j] * wf[8 + i + (j << 2)];
+        vals[4 + i] = sum + wf[8 + 16 + i];
+    }
+    // Store one flag per byte. Writing the bytes individually gives the same
+    // result on every byte order and avoids a type-punned int store into the
+    // uint8_t buffer.
+    for (int i = 0; i < 4; ++i)
+        d[i] = vals[4 + i] > 0.0f;
+}
+
+/**
+ * First pass over every processed plane: copy the rows that are kept from
+ * the padded source into the output, then run the prescreener on the rows
+ * of the other parity.
+ *
+ * Pixels accepted by the prescreener are cubic-interpolated by processLine0;
+ * the remaining pixels are set to the marker value 255 so that evalFunc_1()
+ * predicts them. With prescreening disabled every pixel is marked.
+ */
+static void evalFunc_0(NNEDIContext *d, FrameData *frameData)
+{
+    float *input = frameData->input;
+    const float *weights0 = d->weights0;
+    float *temp = frameData->temp;
+    // The temp buffer is reused as one byte flag per padded column.
+    uint8_t *tempu = (uint8_t *)temp;
+
+    // And now the actual work.
+    for (int plane = 0; plane < d->nb_planes; ++plane) {
+        if (!(d->process_plane & (1 << plane)))
+            continue;
+
+        const uint8_t *srcp = (const uint8_t *)frameData->paddedp[plane];
+        const int src_stride = frameData->padded_stride[plane] / sizeof(uint8_t);
+
+        const int width = frameData->padded_width[plane];
+        const int height = frameData->padded_height[plane];
+
+        uint8_t *dstp = (uint8_t *)frameData->dstp[plane];
+        const int dst_stride = frameData->dst_stride[plane] / sizeof(uint8_t);
+
+        // Copy the rows of parity (1 - field) from the padded source
+        // (skipping the 32-column / 6-row border) into the output.
+        for (int y = 1 - frameData->field[plane]; y < height - 12; y += 2)
+            memcpy(dstp + y * dst_stride,
+                   srcp + 32 + (6 + y) * src_stride,
+                   (width - 64) * sizeof(uint8_t));
+
+        // From here on x and y are padded-buffer coordinates.
+        const int ystart = 6 + frameData->field[plane];
+        const int ystop = height - 6;
+        srcp += ystart * src_stride;
+        // Bias dstp and lcount so they can be indexed with padded coordinates.
+        dstp += (ystart - 6) * dst_stride - 32;
+        // Three rows above the row being produced; processLine0 reads rows
+        // 0, 2, 4 and 6 relative to this pointer.
+        const uint8_t *src3p = srcp - src_stride * 3;
+        int32_t *lcount = frameData->lcount[plane] - 6;
+        if (d->pscrn == 1) {// original
+            for (int y = ystart; y < ystop; y += 2) {
+                // One network evaluation per pixel.
+                for (int x = 32; x < width - 32; ++x) {
+                    d->readPixels((const uint8_t *)(src3p + x - 5), src_stride, input);
+                    d->computeNetwork0(d, input, weights0, tempu+x);
+                }
+                lcount[y] += d->processLine0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, d->max_value, plane);
+                src3p += src_stride * 2;
+                dstp += dst_stride * 2;
+            }
+        } else if (sizeof(uint8_t) == 1 && d->pscrn >= 2) {// new
+            for (int y = ystart; y < ystop; y += 2) {
+                // One network evaluation per group of four pixels.
+                for (int x = 32; x < width - 32; x += 4) {
+                    d->readPixels((const uint8_t *)(src3p + x - 6), src_stride, input);
+                    d->computeNetwork0(d, input, weights0, tempu + x);
+                }
+                lcount[y] += d->processLine0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, d->max_value, plane);
+                src3p += src_stride * 2;
+                dstp += dst_stride * 2;
+            }
+        } else {// no prescreening
+            // Mark every pixel of the row for prediction in evalFunc_1().
+            for (int y = ystart; y < ystop; y += 2) {
+                memset(dstp + 32, 255, (width - 64) * sizeof(uint8_t));
+                lcount[y] += width - 64;
+                dstp += dst_stride * 2;
+            }
+        }
+    }
+}
+
+/**
+ * Gather an xdia x ydia neighbourhood (every second row of srcp8) as floats
+ * into input and compute its statistics.
+ *
+ * On return mstd[0] holds the mean, mstd[1] the standard deviation and
+ * mstd[2] its reciprocal (both 0 when the variance is <= FLT_EPSILON);
+ * mstd[3] is reset to 0.
+ */
+static void extract_m8_C(const uint8_t *srcp8, const int stride, const int xdia, const int ydia, float *mstd, float *input) {
+    int64_t total = 0, total_sq = 0;
+
+    for (int row = 0; row < ydia; row++) {
+        const uint8_t *line = srcp8 + row * stride * 2;
+
+        for (int col = 0; col < xdia; col++) {
+            const uint32_t v = line[col];
+
+            total    += v;
+            total_sq += v * v;
+            *input++  = v;
+        }
+    }
+
+    const float inv_count = 1.0f / (xdia * ydia);
+    mstd[0] = total * inv_count;
+    const double variance = (double)total_sq * inv_count - (double)mstd[0] * mstd[0];
+    mstd[3] = 0.0f;
+    if (variance <= FLT_EPSILON) {
+        mstd[1] = mstd[2] = 0.0f;
+    } else {
+        mstd[1] = sqrt(variance);
+        mstd[2] = 1.0f / mstd[1];
+    }
+}
+
+/**
+ * int16 variant of the neighbourhood extraction: the xdia x ydia samples
+ * (every second row of srcp) are stored as int16_t in the buffer inputf
+ * points to.
+ *
+ * On return mstd[0] holds the mean, mstd[1] the standard deviation and
+ * mstd[2] its reciprocal (both 0 when the variance is <= FLT_EPSILON);
+ * mstd[3] is reset to 0.
+ */
+static void extract_m8_i16_C(const uint8_t *srcp, const int stride, const int xdia, const int ydia, float *mstd, float *inputf) {
+    int16_t *out = (int16_t *)inputf;
+    int total = 0, total_sq = 0;
+
+    for (int row = 0; row < ydia; row++) {
+        const uint8_t *line = srcp + row * stride * 2;
+
+        for (int col = 0; col < xdia; col++) {
+            const int v = line[col];
+
+            total    += v;
+            total_sq += v * v;
+            *out++    = v;
+        }
+    }
+
+    const float inv_count = 1.0f / (float)(xdia * ydia);
+    mstd[0] = total * inv_count;
+    mstd[1] = total_sq * inv_count - mstd[0] * mstd[0];
+    mstd[3] = 0.0f;
+    if (mstd[1] <= FLT_EPSILON) {
+        mstd[1] = mstd[2] = 0.0f;
+    } else {
+        mstd[1] = sqrt(mstd[1]);
+        mstd[2] = 1.0f / mstd[1];
+    }
+}
+
+
+// Clamp range applied to the exponent before calling exp(). The constants
+// are file-local, so they are static to keep them out of the library's
+// exported symbol table.
+static const float exp_lo = -80.0f;
+static const float exp_hi = +80.0f;
+
+/**
+ * Replace each of the first n elements of s by exp() of its value, after
+ * clamping the value to [exp_lo, exp_hi].
+ */
+static void e2_m16_C(float *s, const int n)
+{
+    for (int i = 0; i < n; ++i)
+        s[i] = exp(FFMAX(FFMIN(s[i], exp_hi), exp_lo));
+}
+
+// Smallest weight sum for which the weighted average is used. File-local,
+// hence static so that it is not exported from the library.
+static const float min_weight_sum = 1e-10f;
+
+/**
+ * Combine the outputs of one predictor pass and accumulate the predicted
+ * pixel value into mstd[3].
+ *
+ * @param w    2 * n values: w[0..n-1] are the (already exponentiated)
+ *             weights, w[n..2n-1] the values passed through the Elliott
+ *             function here
+ * @param n    number of neurons
+ * @param mstd mstd[0] is the neighbourhood mean, mstd[1] its standard
+ *             deviation; mstd[3] is the accumulator that is updated
+ */
+static void weightedAvgElliottMul5_m16_C(const float *w, const int n, float *mstd) {
+    float vsum = 0.0f, wsum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        vsum += w[i] * (w[n + i] / (1.0f + FFABS(w[n + i])));
+        wsum += w[i];
+    }
+    // When the weights are negligible, the mean alone is used.
+    if (wsum > min_weight_sum)
+        mstd[3] += ((5.0f * vsum) / wsum) * mstd[1] + mstd[0];
+    else
+        mstd[3] += mstd[0];
+}
+
+
+/**
+ * Second pass over every processed plane: run the predictor network on each
+ * pixel that evalFunc_0() left at the marker value 255 and write the
+ * predicted value to the output.
+ */
+static void evalFunc_1(NNEDIContext *d, FrameData *frameData)
+{
+    float *input = frameData->input;
+    float *temp = frameData->temp;
+    float **weights1 = d->weights1;
+    const int qual = d->qual;
+    const int asize = d->asize;
+    const int nns = d->nns;
+    const int xdia = d->xdia;
+    const int xdiad2m1 = (xdia / 2) - 1;
+    const int ydia = d->ydia;
+    // The qual passes are summed in mstd[3]; scale turns the sum into a mean.
+    const float scale = 1.0f / (float)qual;
+
+    for (int plane = 0; plane < d->nb_planes; ++plane) {
+        if (!(d->process_plane & (1 << plane)))
+            continue;
+
+        const uint8_t *srcp = (const uint8_t *)frameData->paddedp[plane];
+        const int src_stride = frameData->padded_stride[plane] / sizeof(uint8_t);
+
+        const int width = frameData->padded_width[plane];
+        const int height = frameData->padded_height[plane];
+
+        uint8_t *dstp = (uint8_t *)frameData->dstp[plane];
+        const int dst_stride = frameData->dst_stride[plane] / sizeof(uint8_t);
+
+        // y runs over output rows; x runs over padded columns.
+        const int ystart = frameData->field[plane];
+        const int ystop = height - 12;
+
+        srcp += (ystart + 6) * src_stride;
+        // Bias dstp so it can be indexed with padded x coordinates.
+        dstp += ystart * dst_stride - 32;
+        // Top-left corner of the neighbourhood for column 0 of this row.
+        const uint8_t *srcpp = srcp - (ydia - 1) * src_stride - xdiad2m1;
+
+        for (int y = ystart; y < ystop; y += 2) {
+            for (int x = 32; x < width - 32; ++x) {
+                uint32_t pixel = 0;
+                memcpy(&pixel, dstp + x, sizeof(uint8_t));
+
+                uint32_t all_ones = 0;
+                memset(&all_ones, 255, sizeof(uint8_t));
+
+                // Only pixels still carrying the marker value need prediction.
+                if (pixel != all_ones)
+                    continue;
+
+                float mstd[4];
+                d->extract((const uint8_t *)(srcpp + x), src_stride, xdia, ydia, mstd, input);
+                for (int i = 0; i < qual; ++i) {
+                    // 2 * nns neurons: the first nns are exponentiated, and
+                    // wae5 combines both halves into mstd[3].
+                    d->dotProd(d, input, weights1[i], temp, nns * 2, asize, mstd + 2);
+                    d->expfunc(temp, nns);
+                    d->wae5(temp, nns, mstd);
+                }
+
+                dstp[x] = FFMIN(FFMAX((int)(mstd[3] * scale + 0.5f), 0), d->max_value);
+            }
+            srcpp += src_stride * 2;
+            dstp += dst_stride * 2;
+        }
+    }
+}
+
+#define NUM_NSIZE 7
+#define NUM_NNS 5
+
+/**
+ * Round f to the nearest integer (fractions of exactly 0.5 round up) and
+ * saturate the result to the int16 range [-32768, 32767].
+ *
+ * Both bounds are applied whichever way the value was rounded, and NaN maps
+ * to 0 instead of being converted to int, which is undefined. NaN can reach
+ * this function from init() as 0 * inf when a neuron's weights are all equal.
+ */
+static int roundds(const double f)
+{
+    double r;
+
+    if (isnan(f))
+        return 0;
+
+    r = (f - floor(f) >= 0.5) ? ceil(f) : floor(f);
+
+    // Compare as double so out-of-range values are never cast to int.
+    if (r > 32767.0)
+        return 32767;
+    if (r < -32768.0)
+        return -32768;
+    return (int)r;
+}
+
+/**
+ * Fill in the function pointers of the context according to the
+ * prescreener type (pscrn) and the int16 approximation flags (fapprox).
+ */
+static void selectFunctions(NNEDIContext *d)
+{
+    const int i16_prescreener = d->fapprox & 1;
+    const int i16_predictor   = d->fapprox & 2;
+
+    // Top-level stages.
+    d->copyPad    = copyPad;
+    d->evalFunc_0 = evalFunc_0;
+    d->evalFunc_1 = evalFunc_1;
+
+    // evalFunc_0 helpers.
+    d->processLine0 = processLine0_C;
+
+    if (d->pscrn >= 2) {
+        // New prescreener: int16 dot products only.
+        d->readPixels      = byte2word64_C;
+        d->computeNetwork0 = computeNetwork0new_C;
+    } else if (i16_prescreener) {
+        // Original prescreener, int16 first layer.
+        d->readPixels      = byte2word48_C;
+        d->computeNetwork0 = computeNetwork0_i16_C;
+    } else {
+        // Original prescreener, float first layer.
+        d->readPixels      = pixel2float48_C;
+        d->computeNetwork0 = computeNetwork0_C;
+    }
+
+    // evalFunc_1 helpers.
+    d->wae5    = weightedAvgElliottMul5_m16_C;
+    d->extract = i16_predictor ? extract_m8_i16_C : extract_m8_C;
+    d->dotProd = i16_predictor ? dotProdS_C : dotProd_C;
+    d->expfunc = e2_m16_C;
+}
+
+/**
+ * Return m unchanged when it is a multiple of n, otherwise
+ * m + n - (m % n), i.e. the next multiple of n for non-negative m.
+ */
+static int modnpf(const int m, const int n)
+{
+    const int rem = m % n;
+
+    return rem ? m + n - rem : m;
+}
+
+/**
+ * Process one input frame: build padded copies of the selected planes, run
+ * the prescreener/interpolation pass and the predictor pass, and send the
+ * result downstream.
+ *
+ * Planes that are not selected by the "planes" bitmask are copied unchanged
+ * from the input so that the output frame never contains uninitialized
+ * data. Frame properties are copied from the input and the output is
+ * flagged as progressive.
+ *
+ * Ownership of src is taken: it is freed on every path.
+ *
+ * @return the result of ff_filter_frame(), or a negative error code if a
+ *         buffer cannot be allocated or the frame properties cannot be copied
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *src)
+{
+    AVFilterContext *ctx = inlink->dst;
+    NNEDIContext *d = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    FrameData frameData = { { 0 } };
+    AVFrame *dst = NULL;
+    // evalFunc_1 requires at least 512 floats in temp.
+    size_t temp_size = 512 * sizeof(float);
+    int effective_field = d->field;
+    int field_n;
+    int ret;
+
+    if (effective_field > 1)
+        effective_field -= 2;
+
+    if (d->fieldbased == 1)
+        effective_field = 0;
+    else if (d->fieldbased == 2)
+        effective_field = 1;
+
+    if (d->field > 1) {
+        // Alternate the field from one frame to the next.
+        if (inlink->frame_count & 1) {
+            field_n = (effective_field == 0);
+        } else {
+            field_n = (effective_field == 1);
+        }
+    } else {
+        field_n = effective_field;
+    }
+
+    dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!dst) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    // Carries over pts together with the other frame properties.
+    ret = av_frame_copy_props(dst, src);
+    if (ret < 0)
+        goto fail;
+    dst->interlaced_frame = 0;
+
+    for (int plane = 0; plane < d->nb_planes; plane++) {
+        if (!(d->process_plane & (1 << plane))) {
+            // Pass unprocessed planes through untouched.
+            av_image_copy_plane(dst->data[plane], dst->linesize[plane],
+                                src->data[plane], src->linesize[plane],
+                                d->linesize[plane], d->planeheight[plane]);
+            continue;
+        }
+
+        const int min_pad = 10;
+        const int min_alignment = 16;
+
+        int dst_width = d->linesize[plane];
+        int dst_height = d->planeheight[plane];
+
+        frameData.padded_width[plane]  = dst_width + 64;
+        frameData.padded_height[plane] = dst_height + 12;
+        frameData.padded_stride[plane] = modnpf(frameData.padded_width[plane] + min_pad, min_alignment); // TODO: maybe min_pad is in pixels too?
+        frameData.paddedp[plane] = av_malloc((size_t)frameData.padded_stride[plane] * (size_t)frameData.padded_height[plane]);
+
+        frameData.dstp[plane] = dst->data[plane];
+        frameData.dst_stride[plane] = dst->linesize[plane];
+
+        frameData.lcount[plane] = av_calloc(dst_height, sizeof(int32_t) * 16);
+
+        frameData.field[plane] = field_n;
+
+        if (!frameData.paddedp[plane] || !frameData.lcount[plane]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        // evalFunc_0 uses temp as one byte per padded column of the plane
+        // being processed, so size it for the widest processed plane rather
+        // than for plane 0 only.
+        temp_size = FFMAX(temp_size, (size_t)frameData.padded_width[plane]);
+    }
+
+    frameData.input = av_malloc(512 * sizeof(float));
+    frameData.temp  = av_malloc(temp_size);
+    if (!frameData.input || !frameData.temp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    // Copy src to a padded "frame" in frameData and mirror the edges.
+    d->copyPad(src, &frameData, d, field_n);
+
+    // Handles prescreening and the cubic interpolation.
+    d->evalFunc_0(d, &frameData);
+
+    // The rest.
+    d->evalFunc_1(d, &frameData);
+
+    // Clean up.
+    for (int plane = 0; plane < 3; plane++) {
+        av_freep(&frameData.paddedp[plane]);
+        av_freep(&frameData.lcount[plane]);
+    }
+    av_freep(&frameData.input);
+    av_freep(&frameData.temp);
+
+    av_frame_free(&src);
+
+    return ff_filter_frame(outlink, dst);
+
+fail:
+    // av_freep() and av_frame_free() accept pointers that are still NULL.
+    for (int plane = 0; plane < 3; plane++) {
+        av_freep(&frameData.paddedp[plane]);
+        av_freep(&frameData.lcount[plane]);
+    }
+    av_freep(&frameData.input);
+    av_freep(&frameData.temp);
+    av_frame_free(&dst);
+    av_frame_free(&src);
+
+    return ret;
+}
+
+/**
+ * Release everything allocated by init(): the prescreener weights, both
+ * predictor weight sets and the float DSP context.
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NNEDIContext *s = ctx->priv;
+
+    av_freep(&s->weights0);
+
+    for (int i = 0; i < 2; i++)
+        av_freep(&s->weights1[i]);
+
+    // Allocated with avpriv_float_dsp_alloc() in init(); it was leaked before.
+    av_freep(&s->fdsp);
+}
+
+/**
+ * Load the weights file and prepare the prescreener and predictor weights
+ * for the selected options.
+ *
+ * The file must be exactly 13574928 bytes long. From it the function
+ * extracts the prescreener weights (weights0) and two sets of predictor
+ * weights (weights1[0..1]) for the chosen nsize/nns/etype, removes the mean
+ * of each neuron's weights and, for the int16 code paths, rescales the
+ * weights to the int16 range. Finally it selects the processing functions
+ * and allocates the float DSP context.
+ *
+ * @return 0 on success, AVERROR(EINVAL) if the weights file cannot be read
+ *         or has the wrong size, AVERROR(ENOMEM) on allocation failure
+ */
+static av_cold int init(AVFilterContext *ctx)
+{
+    NNEDIContext *s = ctx->priv;
+    FILE *weights_file = NULL;
+    float *bdata = NULL;
+    int ret = 0;
+
+    weights_file = fopen(s->weights_file, "rb");
+
+    if (!weights_file) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to open weights file '%s'.\n", s->weights_file);
+        return AVERROR(EINVAL);
+    }
+
+    if (fseek(weights_file, 0, SEEK_END)) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to seek to the end of the weights file.\n");
+        fclose(weights_file);
+        return AVERROR(EINVAL);
+    }
+
+    const long expected_size = 13574928;
+    const long weights_size = ftell(weights_file);
+
+    if (weights_size == -1) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to get the size of the weights file.\n");
+        fclose(weights_file);
+        return AVERROR(EINVAL);
+    } else if (weights_size != expected_size) {
+        av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size %ld, expected %ld.\n",
+               weights_size, expected_size);
+        fclose(weights_file);
+        return AVERROR(EINVAL);
+    }
+
+    if (fseek(weights_file, 0, SEEK_SET)) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to seek to the start of the weights file.\n");
+        fclose(weights_file);
+        return AVERROR(EINVAL);
+    }
+
+    bdata = av_malloc(expected_size);
+    if (!bdata) {
+        fclose(weights_file);
+        return AVERROR(ENOMEM);
+    }
+
+    const size_t bytes_read = fread(bdata, 1, expected_size, weights_file);
+
+    fclose(weights_file);
+
+    if (bytes_read != (size_t)expected_size) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to read the weights file.\n");
+        av_free(bdata);
+        return AVERROR(EINVAL);
+    }
+
+    const int xdiaTable[NUM_NSIZE] = { 8, 16, 32, 48, 8, 16, 32 };
+    const int ydiaTable[NUM_NSIZE] = { 6, 6, 6, 6, 4, 4, 4 };
+    const int nnsTable[NUM_NNS] = { 16, 32, 64, 128, 256 };
+
+    // Sizes, in floats, of the sections of the weights file.
+    const int dims0 = 49 * 4 + 5 * 4 + 9 * 4;
+    const int dims0new = 4 * 65 + 4 * 5;
+    const int dims1 = nnsTable[s->nnsparam] * 2 * (xdiaTable[s->nsize] * ydiaTable[s->nsize] + 1);
+    int dims1tsize = 0;
+    int dims1offset = 0;
+
+    // Locate the predictor weights for the chosen nsize/nns combination and
+    // compute the total size of one etype block.
+    for (int j = 0; j < NUM_NNS; ++j) {
+        for (int i = 0; i < NUM_NSIZE; ++i) {
+            if (i == s->nsize && j == s->nnsparam)
+                dims1offset = dims1tsize;
+            dims1tsize += nnsTable[j] * 2 * (xdiaTable[i] * ydiaTable[i] + 1) * 2;
+        }
+    }
+
+    s->weights0 = av_malloc(FFMAX(dims0, dims0new) * sizeof(float));
+    if (!s->weights0) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (int i = 0; i < 2; ++i) {
+        s->weights1[i] = av_malloc(dims1 * sizeof(float));
+        if (!s->weights1[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+    }
+
+    // Adjust prescreener weights
+    if (s->pscrn >= 2) {// using new prescreener
+        int *offt = av_calloc(4 * 64, sizeof(int));
+        if (!offt) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        for (int j = 0; j < 4; ++j)
+            for (int k = 0; k < 64; ++k)
+                offt[j * 64 + k] = ((k >> 3) << 5) + ((j & 3) << 3) + (k & 7);
+        const float *bdw = bdata + dims0 + dims0new * (s->pscrn - 2);
+        int16_t *ws = (int16_t *)s->weights0;
+        float *wf = (float *)&ws[4 * 64];
+        double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
+        // Calculate mean weight of each first layer neuron
+        for (int j = 0; j < 4; ++j) {
+            double cmean = 0.0;
+            for (int k = 0; k < 64; ++k)
+                cmean += bdw[offt[j * 64 + k]];
+            mean[j] = cmean / 64.0;
+        }
+        // Factor mean removal and 1.0/127.5 scaling
+        // into first layer weights. scale to int16 range
+        for (int j = 0; j < 4; ++j) {
+            double mval = 0.0;
+            for (int k = 0; k < 64; ++k)
+                mval = FFMAX(mval, FFABS((bdw[offt[j * 64 + k]] - mean[j]) / 127.5));
+            // A neuron whose weights are all equal has mval == 0; use a zero
+            // scale instead of dividing by zero (every weight is then 0).
+            const double scale = mval > 0.0 ? 32767.0 / mval : 0.0;
+            for (int k = 0; k < 64; ++k)
+                ws[offt[j * 64 + k]] = roundds(((bdw[offt[j * 64 + k]] - mean[j]) / 127.5) * scale);
+            wf[j] = (float)(mval / 32767.0);
+        }
+        memcpy(wf + 4, bdw + 4 * 64, (dims0new - 4 * 64) * sizeof(float));
+        av_free(offt);
+    } else {// using old prescreener
+        double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
+        // Calculate mean weight of each first layer neuron
+        for (int j = 0; j < 4; ++j) {
+            double cmean = 0.0;
+            for (int k = 0; k < 48; ++k)
+                cmean += bdata[j * 48 + k];
+            mean[j] = cmean / 48.0;
+        }
+        if (s->fapprox & 1) {// use int16 dot products in first layer
+            int16_t *ws = (int16_t *)s->weights0;
+            float *wf = (float *)&ws[4 * 48];
+            // Factor mean removal and 1.0/127.5 scaling
+            // into first layer weights. scale to int16 range
+            for (int j = 0; j < 4; ++j) {
+                double mval = 0.0;
+                for (int k = 0; k < 48; ++k)
+                    mval = FFMAX(mval, FFABS((bdata[j * 48 + k] - mean[j]) / 127.5));
+                // See above: avoid dividing by zero for a constant neuron.
+                const double scale = mval > 0.0 ? 32767.0 / mval : 0.0;
+                for (int k = 0; k < 48; ++k)
+                    ws[j * 48 + k] = roundds(((bdata[j * 48 + k] - mean[j]) / 127.5) * scale);
+                wf[j] = (float)(mval / 32767.0);
+            }
+            memcpy(wf + 4, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
+        } else {// use float dot products in first layer
+            double half = (1 << 8) - 1;
+
+            half /= 2;
+
+            // Factor mean removal and 1.0/half scaling
+            // into first layer weights.
+            for (int j = 0; j < 4; ++j)
+                for (int k = 0; k < 48; ++k)
+                    s->weights0[j * 48 + k] = (float)((bdata[j * 48 + k] - mean[j]) / half);
+            memcpy(s->weights0 + 4 * 48, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
+        }
+    }
+
+    // Adjust prediction weights
+    for (int i = 0; i < 2; ++i) {
+        const float *bdataT = bdata + dims0 + dims0new * 3 + dims1tsize * s->etype + dims1offset + i * dims1;
+        const int nnst = nnsTable[s->nnsparam];
+        const int asize = xdiaTable[s->nsize] * ydiaTable[s->nsize];
+        const int boff = nnst * 2 * asize;
+        // mean[0..asize-1]: per-input mean of the softmax neurons,
+        // mean[asize]: mean softmax bias,
+        // mean[asize + 1 + j]: mean weight of neuron j.
+        double *mean = av_calloc(asize + 1 + nnst * 2, sizeof(double));
+        if (!mean) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        // Calculate mean weight of each neuron (ignore bias)
+        for (int j = 0; j < nnst * 2; ++j) {
+            double cmean = 0.0;
+            for (int k = 0; k < asize; ++k)
+                cmean += bdataT[j * asize + k];
+            mean[asize + 1 + j] = cmean / (double)asize;
+        }
+        // Calculate mean softmax neuron
+        for (int j = 0; j < nnst; ++j) {
+            for (int k = 0; k < asize; ++k)
+                mean[k] += bdataT[j * asize + k] - mean[asize + 1 + j];
+            mean[asize] += bdataT[boff + j];
+        }
+        for (int j = 0; j < asize + 1; ++j)
+            mean[j] /= (double)(nnst);
+
+        if (s->fapprox & 2) {// use int16 dot products
+            int16_t *ws = (int16_t *)s->weights1[i];
+            float *wf = (float *)&ws[nnst * 2 * asize];
+            // Factor mean removal into weights, remove global offset from
+            // softmax neurons, and scale weights to int16 range.
+            for (int j = 0; j < nnst; ++j) {// softmax neurons
+                double mval = 0.0;
+                for (int k = 0; k < asize; ++k)
+                    mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]));
+                // Avoid dividing by zero for a constant neuron.
+                const double scale = mval > 0.0 ? 32767.0 / mval : 0.0;
+                for (int k = 0; k < asize; ++k)
+                    ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]) * scale);
+                wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
+                wf[(j >> 2) * 8 + (j & 3) + 4] = (float)(bdataT[boff + j] - mean[asize]);
+            }
+            for (int j = nnst; j < nnst * 2; ++j) {// elliott neurons
+                double mval = 0.0;
+                for (int k = 0; k < asize; ++k)
+                    mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j]));
+                // Avoid dividing by zero for a constant neuron.
+                const double scale = mval > 0.0 ? 32767.0 / mval : 0.0;
+                for (int k = 0; k < asize; ++k)
+                    ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j]) * scale);
+                wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
+                wf[(j >> 2) * 8 + (j & 3) + 4] = bdataT[boff + j];
+            }
+        } else {// use float dot products
+            // Factor mean removal into weights, and remove global
+            // offset from softmax neurons.
+            for (int j = 0; j < nnst * 2; ++j) {
+                for (int k = 0; k < asize; ++k) {
+                    const double q = j < nnst ? mean[k] : 0.0;
+                    s->weights1[i][j * asize + k] = (float)(bdataT[j * asize + k] - mean[asize + 1 + j] - q);
+                }
+                s->weights1[i][boff + j] = (float)(bdataT[boff + j] - (j < nnst ? mean[asize] : 0.0));
+            }
+        }
+        av_free(mean);
+    }
+
+    s->nns = nnsTable[s->nnsparam];
+    s->xdia = xdiaTable[s->nsize];
+    s->ydia = ydiaTable[s->nsize];
+    s->asize = xdiaTable[s->nsize] * ydiaTable[s->nsize];
+
+    av_freep(&bdata);
+    s->max_value = 65535 >> 8;
+
+    selectFunctions(s);
+
+    s->fdsp = avpriv_float_dsp_alloc(0);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    return 0;
+
+fail:
+    // av_freep() resets the pointers, so a later uninit() stays safe.
+    av_free(bdata);
+    av_freep(&s->weights0);
+    for (int i = 0; i < 2; ++i)
+        av_freep(&s->weights1[i]);
+    return ret;
+}
+
+/* Single video input: config_input() runs when the link is configured and
+ * filter_frame() is called for every incoming frame. */
+static const AVFilterPad inputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .filter_frame  = filter_frame,
+        .config_props  = config_input,
+    },
+    { NULL }
+};
+
+/* Single video output; no config_props callback is installed for it. */
+static const AVFilterPad outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+/* Filter definition registered through REGISTER_FILTER(NNEDI, nnedi, vf). */
+AVFilter ff_vf_nnedi = {
+    .name          = "nnedi",
+    .description   = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation."),
+    .priv_size     = sizeof(NNEDIContext),
+    .priv_class    = &nnedi_class,
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = inputs,
+    .outputs       = outputs,
+};
-- 
1.9.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to