Re: [FFmpeg-devel] [PATCH] Add NVENC encoder

Nicolas George Wed, 26 Nov 2014 12:38:26 -0800

Le sextidi 6 frimaire, an CCXXIII, Timo Rothenpieler a écrit :
> It uses init_static_data to dynamically ask the nvidia driver for the
> supported pixel formats instead.


It means it will try to load and init the library whenever libavcodec is
used, even if this specific encoder is not used. For a library that accesses
hardware devices, that may not be a good idea.

Below, a few quick comments that became a lot of comments; I do not know the
API itself.


> From 793271822a5f52c3aed876fcedc7c6d8edd3c10c Mon Sep 17 00:00:00 2001
> From: Timo Rothenpieler <t...@rothenpieler.org>
> Date: Wed, 26 Nov 2014 11:08:11 +0100
> Subject: [PATCH] Add NVENC encoder
> 
> ---
>  Changelog               |   1 +
>  configure               |  12 +-
>  libavcodec/Makefile     |   1 +
>  libavcodec/allcodecs.c  |   1 +

>  libavcodec/nvenc.c      | 932 
> ++++++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/nvenc_api.c  | 275 ++++++++++++++
>  libavcodec/nvenc_api.h  |  35 ++
>  libavcodec/nvenc_cuda.h |  62 ++++

Is it necessary to split the _api part into a separate file? The whole code is
a bit large, but still manageable, and merging the files would avoid some
header overhead.

>  8 files changed, 1317 insertions(+), 2 deletions(-)
>  create mode 100644 libavcodec/nvenc.c
>  create mode 100644 libavcodec/nvenc_api.c
>  create mode 100644 libavcodec/nvenc_api.h
>  create mode 100644 libavcodec/nvenc_cuda.h
> 
> diff --git a/Changelog b/Changelog
> index 7172d0c..d26b7fa 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -17,6 +17,7 @@ version <next>:
>  - WebP muxer with animated WebP support
>  - zygoaudio decoding support
>  - APNG demuxer
> +- nvenc encoder
>  
>  
>  version 2.4:
> diff --git a/configure b/configure
> index 38619c4..05bce5d 100755
> --- a/configure
> +++ b/configure
> @@ -261,6 +261,7 @@ External library support:
>    --enable-libzvbi         enable teletext support via libzvbi [no]
>    --disable-lzma           disable lzma [autodetect]
>    --enable-decklink        enable Blackmagick DeckLink I/O support [no]
> +  --enable-nvenc           enable NVIDIA NVENC support [no]
>    --enable-openal          enable OpenAL 1.1 capture support [no]
>    --enable-opencl          enable OpenCL code
>    --enable-opengl          enable OpenGL rendering [no]
> @@ -1393,6 +1394,7 @@ EXTERNAL_LIBRARY_LIST="
>      libzmq
>      libzvbi
>      lzma
> +    nvenc
>      openal
>      opencl
>      opengl
> @@ -2389,6 +2391,7 @@ libxvid_encoder_deps="libxvid"
>  libutvideo_decoder_deps="libutvideo"
>  libutvideo_encoder_deps="libutvideo"
>  libzvbi_teletext_decoder_deps="libzvbi"
> +nvenc_encoder_deps="nvenc"
>  
>  # demuxers / muxers
>  ac3_demuxer_select="ac3_parser"
> @@ -2569,9 +2572,7 @@ drawtext_filter_deps="libfreetype"
>  ebur128_filter_deps="gpl"
>  flite_filter_deps="libflite"
>  frei0r_filter_deps="frei0r dlopen"

> -frei0r_filter_extralibs='$ldl'
>  frei0r_src_filter_deps="frei0r dlopen"
> -frei0r_src_filter_extralibs='$ldl'
>  geq_filter_deps="gpl"
>  histeq_filter_deps="gpl"
>  hqdn3d_filter_deps="gpl"
> @@ -4344,6 +4345,7 @@ die_license_disabled gpl x11grab
>  
>  die_license_disabled nonfree libaacplus
>  die_license_disabled nonfree libfaac
> +die_license_disabled nonfree nvenc
>  enabled gpl && die_license_disabled_gpl nonfree libfdk_aac
>  enabled gpl && die_license_disabled_gpl nonfree openssl
>  
> @@ -4650,6 +4652,11 @@ elif check_func dlopen -ldl; then
>      ldl=-ldl
>  fi
>  
> +# set a few flags which depend on ldl and can't be set earlier
> +nvenc_encoder_extralibs='$ldl'

> +frei0r_filter_extralibs='$ldl'
> +frei0r_src_filter_extralibs='$ldl'

I think moving the frei0r rules belongs in a separate patch.

> +
>  if ! disabled network; then
>      check_func getaddrinfo $network_extralibs
>      check_func getservbyport $network_extralibs
> @@ -4913,6 +4920,7 @@ enabled libxavs           && require libxavs xavs.h 
> xavs_encoder_encode -lxavs
>  enabled libxvid           && require libxvid xvid.h xvid_global -lxvidcore
>  enabled libzmq            && require_pkg_config libzmq zmq.h zmq_ctx_new
>  enabled libzvbi           && require libzvbi libzvbi.h vbi_decoder_new -lzvbi
> +enabled nvenc             && { check_header nvEncodeAPI.h || die "ERROR: 
> nvEncodeAPI.h not found."; }
>  enabled openal            && { { for al_libs in "${OPENAL_LIBS}" "-lopenal" 
> "-lOpenAL32"; do
>                                 check_lib 'AL/al.h' alGetError "${al_libs}" 
> && break; done } ||
>                                 die "ERROR: openal not found"; } &&
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index fa0f53d..cc41564 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -347,6 +347,7 @@ OBJS-$(CONFIG_MXPEG_DECODER)           += mxpegdec.o
>  OBJS-$(CONFIG_NELLYMOSER_DECODER)      += nellymoserdec.o nellymoser.o
>  OBJS-$(CONFIG_NELLYMOSER_ENCODER)      += nellymoserenc.o nellymoser.o
>  OBJS-$(CONFIG_NUV_DECODER)             += nuv.o rtjpeg.o
> +OBJS-$(CONFIG_NVENC_ENCODER)           += nvenc.o nvenc_api.o
>  OBJS-$(CONFIG_ON2AVC_DECODER)          += on2avc.o on2avcdata.o
>  OBJS-$(CONFIG_OPUS_DECODER)            += opusdec.o opus.o opus_celt.o \
>                                            opus_imdct.o opus_silk.o     \
> diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> index 0d39d33..8ceee2f 100644
> --- a/libavcodec/allcodecs.c
> +++ b/libavcodec/allcodecs.c
> @@ -223,6 +223,7 @@ void avcodec_register_all(void)
>      REGISTER_DECODER(MVC2,              mvc2);
>      REGISTER_DECODER(MXPEG,             mxpeg);
>      REGISTER_DECODER(NUV,               nuv);
> +    REGISTER_ENCODER(NVENC,             nvenc);
>      REGISTER_DECODER(PAF_VIDEO,         paf_video);
>      REGISTER_ENCDEC (PAM,               pam);
>      REGISTER_ENCDEC (PBM,               pbm);
> diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
> new file mode 100644
> index 0000000..3cb98d3
> --- /dev/null
> +++ b/libavcodec/nvenc.c
> @@ -0,0 +1,932 @@
> +/*
> + * H.264 hardware encoding using nvidia nvenc
> + * Copyright (c) 2014 Timo Rothenpieler <t...@rothenpieler.org>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#ifdef _WIN32
> +#include <windows.h>
> +#endif
> +
> +#include "libavutil/internal.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/avassert.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/mem.h"
> +#include "avcodec.h"
> +#include "internal.h"
> +
> +#include "nvenc_cuda.h"
> +#include "nvenc_api.h"
> +

> +typedef struct NvencInputSurface
> +{
> +    NV_ENC_INPUT_PTR inputSurface;
> +    int width;
> +    int height;
> +

> +    int lockCount;

The usual coding style for structure members and variables in ffmpeg is
names_separated_with_underscores, not uglyCamelCase. (But I believe the
person who will end up maintaining the file should have the last word on this.)

> +
> +    NV_ENC_BUFFER_FORMAT format;
> +} NvencInputSurface;
> +
> +typedef struct NvencOutputSurface
> +{
> +    NV_ENC_OUTPUT_PTR outputSurface;
> +    int size;
> +
> +    NvencInputSurface *inputSurface;
> +
> +    int busy;
> +} NvencOutputSurface;
> +
> +typedef struct NvencOutputSurfaceList
> +{
> +    NvencOutputSurface *surface;
> +    struct NvencOutputSurfaceList *next;
> +} NvencOutputSurfaceList;
> +
> +typedef struct NvencTimestampList
> +{
> +    int64_t timestamp;
> +    struct NvencTimestampList *next;
> +} NvencTimestampList;
> +
> +typedef struct NvencContext
> +{
> +    AVClass *avclass;
> +
> +    NV_ENC_INITIALIZE_PARAMS initEncodeParams;
> +    NV_ENC_CONFIG encodeConfig;
> +    CUcontext cuContext;
> +
> +    int maxSurfaceCount;
> +    NvencInputSurface *inputSurfaces;
> +    NvencOutputSurface *outputSurfaces;
> +
> +    NvencOutputSurfaceList *outputSurfaceQueue;
> +    NvencOutputSurfaceList *outputSurfaceReadyQueue;
> +    NvencTimestampList *timestampList;
> +    int64_t lastDts;
> +
> +    void *nvencoder;
> +
> +    char *profile;
> +    char *preset;
> +    int cqp;
> +    int cbr;
> +    int twopass;
> +    int gobpattern;
> +} NvencContext;
> +
> +static const GUID dummy_license = { 0x0, 0x0, 0x0, { 0x0, 0x0, 0x0, 0x0, 
> 0x0, 0x0, 0x0, 0x0 } };
> +

> +static void out_surf_queue_push(NvencOutputSurfaceList** head, 
> NvencOutputSurface *surface)
> +{
> +    if (!*head) {
> +        *head = av_malloc(sizeof(NvencOutputSurfaceList));

> +        (*head)->next = 0;

ffmpeg code usually uses NULL for NULL pointers, not 0; other similar cases
below.

> +        (*head)->surface = surface;
> +        return;
> +    }
> +
> +    while ((*head)->next)
> +        head = &((*head)->next);

This looks inefficient. Do you have an estimate of the usual size of the
queue?

I suggest you have a look at the dynarray (in libavutil/mem.h and
dynarray.h) API.

If you really need linked lists, you could probably keep a pointer to the
last element in the structure to avoid walking the list every time.

> +

> +    (*head)->next = av_malloc(sizeof(NvencOutputSurfaceList));

av_malloc() return value needs to be checked. Other similar cases below.

> +    (*head)->next->next = 0;
> +    (*head)->next->surface = surface;
> +}
> +

> +static NvencOutputSurface *out_surf_queue_pop(NvencOutputSurfaceList** head)

If you call this one pop instead of shift, people used to Perl will be very
confused.

> +{
> +    NvencOutputSurfaceList *tmp;
> +    NvencOutputSurface *res;
> +
> +    if (!*head)
> +        return 0;
> +
> +    tmp = *head;
> +    res = tmp->surface;
> +    *head = tmp->next;
> +    av_free(tmp);
> +
> +    return res;
> +}
> +

> +static void timestamp_list_insert_sorted(NvencTimestampList** head, int64_t 
> timestamp)

Same as before: maybe dynarray would be more efficient, avoiding malloc()
with its huge overhead for every insertion.

Also, if the list is expected to be large, you may consider using a heap
instead of a sorted list.

> +{
> +    NvencTimestampList *newelem;
> +    NvencTimestampList *prev;
> +
> +    if (!*head) {
> +        *head = av_malloc(sizeof(NvencTimestampList));
> +        (*head)->next = 0;
> +        (*head)->timestamp = timestamp;
> +        return;
> +    }
> +
> +    prev = 0;
> +    while (*head && timestamp >= (*head)->timestamp) {
> +        prev = *head;
> +        head = &((*head)->next);
> +    }
> +
> +    newelem = av_malloc(sizeof(NvencTimestampList));
> +    newelem->next = *head;
> +    newelem->timestamp = timestamp;
> +
> +    if (*head) {
> +        *head = newelem;
> +    } else {
> +        prev->next = newelem;
> +    }
> +}
> +
> +static int64_t timestamp_list_get_lowest(NvencTimestampList** head)
> +{
> +    NvencTimestampList *tmp;
> +    int64_t res;
> +
> +    if (!*head)
> +        return 0;
> +
> +    tmp = *head;
> +    res = tmp->timestamp;
> +    *head = tmp->next;
> +    av_free(tmp);
> +
> +    return res;
> +}
> +
> +static av_cold int nvenc_encode_init(AVCodecContext *avctx)
> +{
> +    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS stEncodeSessionParams = { 0 };
> +    NV_ENC_PRESET_CONFIG presetConfig = { 0 };
> +    CUcontext cuContextCurr;
> +    GUID encoderPreset = NV_ENC_PRESET_HQ_GUID;
> +    GUID license = dummy_license;
> +    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
> +    int surfaceCount = 0;
> +    int i, numMBs;
> +    int isLL = 0;
> +
> +    NvencContext *ctx = avctx->priv_data;
> +
> +    if (!ff_nvenc_dyload_nvenc(avctx))
> +        return AVERROR_EXTERNAL;
> +
> +    avctx->coded_frame = av_frame_alloc();
> +    if (!avctx->coded_frame)
> +        return AVERROR(ENOMEM);
> +

> +    memset(&ctx->initEncodeParams, 0, sizeof(NV_ENC_INITIALIZE_PARAMS));
> +    memset(&ctx->encodeConfig, 0, sizeof(NV_ENC_CONFIG));

Not needed: the whole structure is set to 0 by the library.

> +
> +    ctx->outputSurfaceQueue = 0;
> +    ctx->outputSurfaceReadyQueue = 0;
> +    ctx->timestampList = 0;
> +    ctx->lastDts = AV_NOPTS_VALUE;
> +    ctx->nvencoder = 0;
> +
> +    ctx->encodeConfig.version = NV_ENC_CONFIG_VER;
> +    ctx->initEncodeParams.version = NV_ENC_INITIALIZE_PARAMS_VER;
> +    presetConfig.version = NV_ENC_PRESET_CONFIG_VER;
> +    presetConfig.presetCfg.version = NV_ENC_CONFIG_VER;
> +    stEncodeSessionParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
> +    stEncodeSessionParams.apiVersion = NVENCAPI_VERSION;
> +    stEncodeSessionParams.clientKeyPtr = &license;
> +
> +    ctx->cuContext = 0;
> +    if (ff_cuCtxCreate(&ctx->cuContext, 0, 
> ff_pNvencDevices[ff_iNvencUseDeviceID]) != CUDA_SUCCESS
> +            || ff_cuCtxPopCurrent(&cuContextCurr) != CUDA_SUCCESS) {

> +        av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for 
> NVENC\n");

Is there a chance of getting a more detailed error reason?

> +        goto error;
> +    }
> +
> +    stEncodeSessionParams.device = (void*)ctx->cuContext;
> +    stEncodeSessionParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
> +
> +    nvStatus = ff_pNvEnc->nvEncOpenEncodeSessionEx(&stEncodeSessionParams, 
> &ctx->nvencoder);
> +    if (nvStatus != NV_ENC_SUCCESS) {
> +        ctx->nvencoder = 0;
> +        av_log(avctx, AV_LOG_FATAL, "OpenEncodeSessionEx failed: 0x%x - 
> invalid license key?\n", (int)nvStatus);
> +        goto error;
> +    }
> +
> +    if (ctx->preset) {

> +        if (!strcmp(ctx->preset, "hp")) {
> +            encoderPreset = NV_ENC_PRESET_HP_GUID;
> +        } else if (!strcmp(ctx->preset, "hq")) {
> +            encoderPreset = NV_ENC_PRESET_HQ_GUID;
> +        } else if (!strcmp(ctx->preset, "bd")) {
> +            encoderPreset = NV_ENC_PRESET_BD_GUID;
> +        } else if (!strcmp(ctx->preset, "ll")) {
> +            encoderPreset = NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID;
> +            isLL = 1;
> +        } else if (!strcmp(ctx->preset, "llhp")) {
> +            encoderPreset = NV_ENC_PRESET_LOW_LATENCY_HP_GUID;
> +            isLL = 1;
> +        } else if (!strcmp(ctx->preset, "llhq")) {
> +            encoderPreset = NV_ENC_PRESET_LOW_LATENCY_HQ_GUID;
> +            isLL = 1;
> +        } else if (!strcmp(ctx->preset, "default")) {
> +            encoderPreset = NV_ENC_PRESET_DEFAULT_GUID;
> +        } else {

> +            av_log(avctx, AV_LOG_ERROR, "Preset \"%s\" is unknown!\n", 
> ctx->preset);

Should return an error. And if you use a table with the list of presets, you
can dump the list.

> +        }
> +    }
> +
> +    nvStatus = ff_pNvEnc->nvEncGetEncodePresetConfig(ctx->nvencoder, 
> NV_ENC_CODEC_H264_GUID, encoderPreset, &presetConfig);
> +    if (nvStatus != NV_ENC_SUCCESS) {
> +        av_log(avctx, AV_LOG_FATAL, "GetEncodePresetConfig failed: 0x%x\n", 
> (int)nvStatus);
> +        goto error;
> +    }
> +
> +    ctx->initEncodeParams.encodeGUID = NV_ENC_CODEC_H264_GUID;
> +    ctx->initEncodeParams.encodeHeight = avctx->height;
> +    ctx->initEncodeParams.encodeWidth = avctx->width;

> +    ctx->initEncodeParams.darHeight = avctx->height;
> +    ctx->initEncodeParams.darWidth = avctx->width;

Was this tested with anamorphic videos?

> +    ctx->initEncodeParams.frameRateNum = avctx->time_base.den;
> +    ctx->initEncodeParams.frameRateDen = avctx->time_base.num * 
> avctx->ticks_per_frame;
> +
> +    numMBs = ((avctx->width + 15) >> 4) * ((avctx->height + 15) >> 4);
> +    ctx->maxSurfaceCount = (numMBs >= 8160) ? 16 : 32;
> +
> +    ctx->initEncodeParams.enableEncodeAsync = 0;
> +    ctx->initEncodeParams.enablePTD = 1;
> +
> +    ctx->initEncodeParams.presetGUID = encoderPreset;
> +
> +    ctx->initEncodeParams.encodeConfig = &ctx->encodeConfig;
> +    memcpy(&ctx->encodeConfig, &presetConfig.presetCfg, 
> sizeof(NV_ENC_CONFIG));
> +    ctx->encodeConfig.version = NV_ENC_CONFIG_VER;
> +
> +    if (avctx->gop_size >= 0) {
> +        ctx->encodeConfig.gopLength = avctx->gop_size;
> +        ctx->encodeConfig.encodeCodecConfig.h264Config.idrPeriod = 
> avctx->gop_size;
> +    }
> +
> +    if (avctx->bit_rate > 0)
> +        ctx->encodeConfig.rcParams.averageBitRate = avctx->bit_rate;
> +
> +    if (avctx->rc_max_rate > 0)
> +        ctx->encodeConfig.rcParams.maxBitRate = avctx->rc_max_rate;
> +
> +    if (ctx->cbr) {
> +        if (!ctx->twopass) {
> +            ctx->encodeConfig.rcParams.rateControlMode = 
> NV_ENC_PARAMS_RC_CBR;
> +        } else if (ctx->twopass == 1 || isLL) {
> +            ctx->encodeConfig.rcParams.rateControlMode = 
> NV_ENC_PARAMS_RC_2_PASS_QUALITY;
> +
> +            
> ctx->encodeConfig.encodeCodecConfig.h264Config.adaptiveTransformMode = 
> NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
> +            ctx->encodeConfig.encodeCodecConfig.h264Config.fmoMode = 
> NV_ENC_H264_FMO_DISABLE;
> +
> +            if (!isLL)
> +                av_log(avctx, AV_LOG_WARNING, "Twopass mode is only known to 
> work with low latency (ll, llhq, llhp) presets.\n");
> +        } else {
> +            ctx->encodeConfig.rcParams.rateControlMode = 
> NV_ENC_PARAMS_RC_CBR;
> +        }
> +    } else if (ctx->cqp >= 0) {
> +        ctx->encodeConfig.rcParams.rateControlMode = 
> NV_ENC_PARAMS_RC_CONSTQP;
> +        ctx->encodeConfig.rcParams.constQP.qpInterB = ctx->cqp;
> +        ctx->encodeConfig.rcParams.constQP.qpInterP = ctx->cqp;
> +        ctx->encodeConfig.rcParams.constQP.qpIntra = ctx->cqp;
> +
> +        avctx->qmin = -1;
> +        avctx->qmax = -1;
> +    } else if (avctx->qmin >= 0 && avctx->qmax >= 0) {
> +        ctx->encodeConfig.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR;
> +
> +        ctx->encodeConfig.rcParams.enableMinQP = 1;
> +        ctx->encodeConfig.rcParams.enableMaxQP = 1;
> +
> +        ctx->encodeConfig.rcParams.minQP.qpInterB = avctx->qmin;
> +        ctx->encodeConfig.rcParams.minQP.qpInterP = avctx->qmin;
> +        ctx->encodeConfig.rcParams.minQP.qpIntra = avctx->qmin;
> +
> +        ctx->encodeConfig.rcParams.maxQP.qpInterB = avctx->qmax;
> +        ctx->encodeConfig.rcParams.maxQP.qpInterP = avctx->qmax;
> +        ctx->encodeConfig.rcParams.maxQP.qpIntra = avctx->qmax;
> +    }
> +
> +    if (avctx->rc_buffer_size > 0)
> +        ctx->encodeConfig.rcParams.vbvBufferSize = avctx->rc_buffer_size;
> +
> +    if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) {
> +        ctx->encodeConfig.frameFieldMode = 
> NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
> +    } else {
> +        ctx->encodeConfig.frameFieldMode = 
> NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
> +    }
> +
> +    if (!ctx->profile) {
> +        switch (avctx->profile) {

> +            case FF_PROFILE_H264_BASELINE:

case is usually indented at the same level as switch.

> +            ctx->profile = av_strdup("baseline");

Need to check the return value.

But it seems you have the private option "profile" conflicting with the
global option "profile", which is confusing, and possibly problematic, for
users.

> +            break;
> +            case FF_PROFILE_H264_MAIN:
> +            ctx->profile = av_strdup("main");
> +            break;
> +            default:
> +            ctx->profile = av_strdup("high");
> +            break;
> +        }
> +    }
> +
> +    ctx->encodeConfig.profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
> +
> +    if (!strcmp(ctx->profile, "high")) {
> +        ctx->encodeConfig.profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
> +    } else if (!strcmp(ctx->profile, "main")) {
> +        ctx->encodeConfig.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
> +    } else if (!strcmp(ctx->profile, "baseline")) {
> +        ctx->encodeConfig.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
> +    } else {
> +        av_log(avctx, AV_LOG_WARNING, "Unknown profile requested: %s\n", 
> ctx->profile);
> +    }
> +
> +    if (ctx->gobpattern >= 0) {
> +        ctx->encodeConfig.frameIntervalP = 1;
> +    }
> +
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag
>  = 1;
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.videoSignalTypePresentFlag
>  = 1;
> +
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.colourMatrix 
> = avctx->colorspace;
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.colourPrimaries
>  = avctx->color_primaries;
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.transferCharacteristics
>  = avctx->color_trc;
> +
> +    
> ctx->encodeConfig.encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag
>  = avctx->color_range == AVCOL_RANGE_JPEG;
> +
> +    ctx->encodeConfig.encodeCodecConfig.h264Config.disableSPSPPS = 
> (avctx->flags & CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
> +
> +    nvStatus = ff_pNvEnc->nvEncInitializeEncoder(ctx->nvencoder, 
> &ctx->initEncodeParams);
> +    if (nvStatus != NV_ENC_SUCCESS) {
> +        av_log(avctx, AV_LOG_FATAL, "InitializeEncoder failed: 0x%x\n", 
> (int)nvStatus);
> +        goto error;
> +    }
> +

> +    ctx->inputSurfaces = (NvencInputSurface*)calloc(ctx->maxSurfaceCount, 
> sizeof(NvencInputSurface));
> +    ctx->outputSurfaces = (NvencOutputSurface*)calloc(ctx->maxSurfaceCount, 
> sizeof(NvencOutputSurface));

The cast is an ugly c++ism, and ffmpeg code recommends sizeof(*variable)
instead of sizeof(Type).

Do you need to use calloc instead of the corresponding av_ function?

Other similar cases below.

> +
> +    for (surfaceCount = 0; surfaceCount < ctx->maxSurfaceCount; 
> ++surfaceCount) {
> +        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
> +        NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
> +        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
> +        allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
> +
> +        allocSurf.width = (avctx->width + 31) & ~31;
> +        allocSurf.height = (avctx->height + 31) & ~31;
> +
> +        allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
> +
> +        switch (avctx->pix_fmt) {
> +            case AV_PIX_FMT_YUV420P:
> +            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
> +            break;
> +
> +            case AV_PIX_FMT_NV12:
> +            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
> +            break;
> +
> +            case AV_PIX_FMT_YUV444P:
> +            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
> +            break;
> +
> +            default:
> +            av_log(avctx, AV_LOG_FATAL, "Invalid input pixel format\n");
> +            goto error;
> +        }
> +
> +        nvStatus = ff_pNvEnc->nvEncCreateInputBuffer(ctx->nvencoder, 
> &allocSurf);
> +        if (nvStatus != NV_ENC_SUCCESS){
> +            av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
> +            goto error;
> +        }
> +
> +        ctx->inputSurfaces[surfaceCount].lockCount = 0;
> +        ctx->inputSurfaces[surfaceCount].inputSurface = 
> allocSurf.inputBuffer;
> +        ctx->inputSurfaces[surfaceCount].format = allocSurf.bufferFmt;
> +        ctx->inputSurfaces[surfaceCount].width = allocSurf.width;
> +        ctx->inputSurfaces[surfaceCount].height = allocSurf.height;
> +

> +        allocOut.size = 1024 * 1024;

Maybe a comment to explain where this value comes from?

> +        allocOut.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
> +
> +        nvStatus = ff_pNvEnc->nvEncCreateBitstreamBuffer(ctx->nvencoder, 
> &allocOut);
> +        if (nvStatus != NV_ENC_SUCCESS) {
> +            av_log(avctx, AV_LOG_FATAL, "CreateBitstreamBuffer failed\n");
> +            ctx->outputSurfaces[surfaceCount++].outputSurface = 0;
> +            goto error;
> +        }
> +
> +        ctx->outputSurfaces[surfaceCount].outputSurface = 
> allocOut.bitstreamBuffer;
> +        ctx->outputSurfaces[surfaceCount].size = allocOut.size;
> +        ctx->outputSurfaces[surfaceCount].busy = 0;
> +    }
> +
> +    if (avctx->flags & CODEC_FLAG_GLOBAL_HEADER) {
> +        uint32_t outSize = 0;
> +        char tmpHeader[256];
> +        NV_ENC_SEQUENCE_PARAM_PAYLOAD payload = { 0 };
> +        payload.version = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
> +
> +        payload.spsppsBuffer = tmpHeader;
> +        payload.inBufferSize = 256;
> +        payload.outSPSPPSPayloadSize = &outSize;
> +
> +        nvStatus = ff_pNvEnc->nvEncGetSequenceParams(ctx->nvencoder, 
> &payload);
> +        if (nvStatus != NV_ENC_SUCCESS) {
> +            av_log(avctx, AV_LOG_FATAL, "GetSequenceParams failed\n");
> +            goto error;
> +        }
> +
> +        avctx->extradata_size = outSize;
> +        avctx->extradata = av_mallocz(outSize + 
> FF_INPUT_BUFFER_PADDING_SIZE);
> +
> +        memcpy(avctx->extradata, tmpHeader, outSize);
> +    } else {

> +        avctx->extradata = 0;
> +        avctx->extradata_size = 0;

Not needed.

> +    }
> +
> +    if (ctx->encodeConfig.frameIntervalP > 1)
> +        avctx->has_b_frames = 2;
> +
> +    if (ctx->encodeConfig.rcParams.averageBitRate > 0)
> +        avctx->bit_rate = ctx->encodeConfig.rcParams.averageBitRate;
> +
> +    return 0;
> +
> +error:
> +
> +    for (i = 0; i < surfaceCount; ++i) {
> +        ff_pNvEnc->nvEncDestroyInputBuffer(ctx->nvencoder, 
> ctx->inputSurfaces[i].inputSurface);
> +        if (ctx->outputSurfaces[i].outputSurface)
> +            ff_pNvEnc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, 
> ctx->outputSurfaces[i].outputSurface);
> +    }
> +
> +    if (ctx->nvencoder)
> +        ff_pNvEnc->nvEncDestroyEncoder(ctx->nvencoder);
> +
> +    if (ctx->cuContext)
> +        ff_cuCtxDestroy(ctx->cuContext);
> +
> +    ff_nvenc_unload_nvenc(avctx);
> +
> +    ctx->nvencoder = 0;
> +    ctx->cuContext = 0;
> +
> +    return AVERROR_EXTERNAL;
> +}
> +
> +static av_cold int nvenc_encode_close(AVCodecContext *avctx)
> +{
> +    NvencContext *ctx = avctx->priv_data;
> +    int i;
> +

> +    if (ctx->profile)
> +        av_freep(&ctx->profile);

Freeing NULL is valid, so you do not need to check beforehand. And in this
case, since ctx->profile is an option, it is automatically freed anyway.

> +
> +    if (avctx->extradata)
> +        av_freep(&avctx->extradata);

extradata is automatically freed for encoders.

> +
> +    while (ctx->timestampList)
> +        timestamp_list_get_lowest(&ctx->timestampList);
> +
> +    while (ctx->outputSurfaceReadyQueue)
> +        out_surf_queue_pop(&ctx->outputSurfaceReadyQueue);
> +
> +    while (ctx->outputSurfaceQueue)
> +        out_surf_queue_pop(&ctx->outputSurfaceQueue);
> +
> +    for (i = 0; i < ctx->maxSurfaceCount; ++i) {
> +        ff_pNvEnc->nvEncDestroyInputBuffer(ctx->nvencoder, 
> ctx->inputSurfaces[i].inputSurface);
> +        ff_pNvEnc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, 
> ctx->outputSurfaces[i].outputSurface);
> +    }
> +    ctx->maxSurfaceCount = 0;
> +
> +    ff_pNvEnc->nvEncDestroyEncoder(ctx->nvencoder);
> +    ctx->nvencoder = 0;
> +
> +    ff_cuCtxDestroy(ctx->cuContext);
> +    ctx->cuContext = 0;
> +
> +    ff_nvenc_unload_nvenc(avctx);
> +
> +    av_frame_free(&avctx->coded_frame);
> +
> +    return 0;
> +}
> +
> +static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, 
> AVFrame *coded_frame, NvencOutputSurface *tmpoutsurf)
> +{
> +    NvencContext *ctx = avctx->priv_data;
> +    uint32_t *sliceOffsets = 
> (uint32_t*)calloc(ctx->encodeConfig.encodeCodecConfig.h264Config.sliceModeData,
>  sizeof(uint32_t));
> +    NV_ENC_LOCK_BITSTREAM lockParams = { 0 };
> +    NVENCSTATUS nvStatus;
> +
> +    lockParams.version = NV_ENC_LOCK_BITSTREAM_VER;
> +
> +    lockParams.doNotWait = 0;
> +    lockParams.outputBitstream = tmpoutsurf->outputSurface;
> +    lockParams.sliceOffsets = sliceOffsets;
> +

> +    nvStatus = ff_pNvEnc->nvEncLockBitstream(ctx->nvencoder, &lockParams);
> +    if (nvStatus != NV_ENC_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed locking bitstream buffer\n");
> +        timestamp_list_get_lowest(&ctx->timestampList);
> +        return 0;

Looks like it should return an error.

> +    }
> +

> +    if (ff_alloc_packet2(avctx, pkt, lockParams.bitstreamSizeInBytes) < 0) {
> +        ff_pNvEnc->nvEncUnlockBitstream(ctx->nvencoder, 
> tmpoutsurf->outputSurface);
> +        timestamp_list_get_lowest(&ctx->timestampList);
> +        return 0;
> +    }

Same as above, and ff_alloc_packet2() already returns a proper error code.

> +
> +    memcpy(pkt->data, lockParams.bitstreamBufferPtr, 
> lockParams.bitstreamSizeInBytes);
> +
> +    nvStatus = ff_pNvEnc->nvEncUnlockBitstream(ctx->nvencoder, 
> tmpoutsurf->outputSurface);
> +    if (nvStatus != NV_ENC_SUCCESS)
> +        av_log(avctx, AV_LOG_ERROR, "Failed unlocking bitstream buffer, 
> expect the gates of mordor to open\n");
> +
> +    switch (lockParams.pictureType) {
> +        case NV_ENC_PIC_TYPE_IDR:
> +        pkt->flags |= AV_PKT_FLAG_KEY;
> +        case NV_ENC_PIC_TYPE_I:
> +        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
> +        break;
> +
> +        case NV_ENC_PIC_TYPE_P:
> +        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
> +        break;
> +
> +        case NV_ENC_PIC_TYPE_B:
> +        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
> +        break;
> +
> +        case NV_ENC_PIC_TYPE_BI:
> +        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
> +        break;
> +

> +        default:
> +        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_NONE;

Does this happen normally?

> +        break;
> +    }
> +
> +    pkt->pts = lockParams.outputTimeStamp;
> +    pkt->dts = timestamp_list_get_lowest(&ctx->timestampList) - 
> ctx->encodeConfig.frameIntervalP;
> +
> +    if (pkt->dts > pkt->pts)
> +        pkt->dts = pkt->pts;
> +
> +    if (ctx->lastDts != AV_NOPTS_VALUE && pkt->dts <= ctx->lastDts)
> +        pkt->dts = ctx->lastDts + 1;
> +
> +    ctx->lastDts = pkt->dts;
> +
> +    return 1;
> +}
> +
> +static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
> +    const AVFrame *frame, int *got_packet)
> +{
> +    NVENCSTATUS nvStatus;
> +    NvencContext *ctx = avctx->priv_data;
> +    NvencOutputSurface *tmpoutsurf;
> +    int i = 0;
> +
> +    NV_ENC_PIC_PARAMS picParams = { 0 };
> +    picParams.version = NV_ENC_PIC_PARAMS_VER;
> +
> +    if (frame) {
> +        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
> +        NvencInputSurface *inSurf = 0;
> +

> +        for (i = 0; i < ctx->maxSurfaceCount; ++i)
> +            if (!ctx->inputSurfaces[i].lockCount)
> +                inSurf = &ctx->inputSurfaces[i];

Maybe a break here.

> +        av_assert0(inSurf);

Are you positively sure that an input surface will always be available?

> +
> +        inSurf->lockCount = 1;
> +
> +        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
> +        lockBufferParams.inputBuffer = inSurf->inputSurface;
> +
> +        nvStatus = ff_pNvEnc->nvEncLockInputBuffer(ctx->nvencoder, 
> &lockBufferParams);
> +        if (nvStatus != NV_ENC_SUCCESS) {
> +            av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input 
> buffer\n");
> +            return 0;
> +        }
> +
> +        if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {

> +            uint8_t *buf = lockBufferParams.bufferDataPtr;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[0], frame->linesize[0],
> +                avctx->width, avctx->height);
> +
> +            buf += inSurf->height * lockBufferParams.pitch;

Could be factored out, unless I am missing something.

> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
> +                frame->data[2], frame->linesize[2],
> +                avctx->width >> 1, avctx->height >> 1);
> +
> +            buf += (inSurf->height * lockBufferParams.pitch) >> 2;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
> +                frame->data[1], frame->linesize[1],
> +                avctx->width >> 1, avctx->height >> 1);
> +        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
> +            uint8_t *buf = lockBufferParams.bufferDataPtr;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[0], frame->linesize[0],
> +                avctx->width, avctx->height);
> +
> +            buf += inSurf->height * lockBufferParams.pitch;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[1], frame->linesize[1],
> +                avctx->width, avctx->height >> 1);
> +        } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
> +            uint8_t *buf = lockBufferParams.bufferDataPtr;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[0], frame->linesize[0],
> +                avctx->width, avctx->height);
> +
> +            buf += inSurf->height * lockBufferParams.pitch;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[1], frame->linesize[1],
> +                avctx->width, avctx->height);
> +
> +            buf += inSurf->height * lockBufferParams.pitch;
> +
> +            av_image_copy_plane(buf, lockBufferParams.pitch,
> +                frame->data[2], frame->linesize[2],
> +                avctx->width, avctx->height);
> +        } else {
> +            av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
> +            return AVERROR(EINVAL);
> +        }
> +
> +        nvStatus = ff_pNvEnc->nvEncUnlockInputBuffer(ctx->nvencoder, 
> inSurf->inputSurface);
> +        if (nvStatus != NV_ENC_SUCCESS) {
> +            av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
> +            return AVERROR_EXTERNAL;
> +        }
> +
> +        for (i = 0; i < ctx->maxSurfaceCount; ++i)
> +            if (!ctx->outputSurfaces[i].busy)
> +                break;
> +
> +        if (i == ctx->maxSurfaceCount) {
> +            inSurf->lockCount = 0;

> +            av_log(avctx, AV_LOG_ERROR, "No free output surface found!\n");
> +            return 0;

Proper error code?

> +        }
> +
> +        ctx->outputSurfaces[i].inputSurface = inSurf;
> +
> +        picParams.inputBuffer = inSurf->inputSurface;
> +        picParams.bufferFmt = inSurf->format;
> +        picParams.inputWidth = avctx->width;
> +        picParams.inputHeight = avctx->height;
> +        picParams.outputBitstream = ctx->outputSurfaces[i].outputSurface;
> +        picParams.completionEvent = 0;
> +
> +        if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) {
> +            if (frame->top_field_first) {
> +                picParams.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
> +            } else {
> +                picParams.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
> +            }
> +        } else {
> +            picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
> +        }
> +
> +        picParams.encodePicFlags = 0;
> +        picParams.inputTimeStamp = frame->pts;
> +        picParams.inputDuration = 0;
> +        picParams.codecPicParams.h264PicParams.sliceMode = 
> ctx->encodeConfig.encodeCodecConfig.h264Config.sliceMode;
> +        picParams.codecPicParams.h264PicParams.sliceModeData = 
> ctx->encodeConfig.encodeCodecConfig.h264Config.sliceModeData;
> +        memcpy(&picParams.rcParams, &ctx->encodeConfig.rcParams, 
> sizeof(NV_ENC_RC_PARAMS));
> +
> +        timestamp_list_insert_sorted(&ctx->timestampList, frame->pts);
> +    } else {
> +        picParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
> +    }
> +
> +    nvStatus = ff_pNvEnc->nvEncEncodePicture(ctx->nvencoder, &picParams);
> +
> +    if (frame && nvStatus == NV_ENC_ERR_NEED_MORE_INPUT) {
> +        out_surf_queue_push(&ctx->outputSurfaceQueue, 
> &ctx->outputSurfaces[i]);
> +        ctx->outputSurfaces[i].busy = 1;
> +    }
> +
> +    if (nvStatus != NV_ENC_SUCCESS && nvStatus != 
> NV_ENC_ERR_NEED_MORE_INPUT) {
> +        av_log(avctx, AV_LOG_ERROR, "EncodePicture failed!\n");
> +        return AVERROR_EXTERNAL;
> +    }
> +
> +    if (nvStatus != NV_ENC_ERR_NEED_MORE_INPUT) {
> +        while (ctx->outputSurfaceQueue) {
> +            tmpoutsurf = out_surf_queue_pop(&ctx->outputSurfaceQueue);
> +            out_surf_queue_push(&ctx->outputSurfaceReadyQueue, tmpoutsurf);
> +        }
> +
> +        if (frame) {
> +            out_surf_queue_push(&ctx->outputSurfaceReadyQueue, 
> &ctx->outputSurfaces[i]);
> +            ctx->outputSurfaces[i].busy = 1;
> +        }
> +    }
> +
> +    if (ctx->outputSurfaceReadyQueue) {
> +        tmpoutsurf = out_surf_queue_pop(&ctx->outputSurfaceReadyQueue);
> +
> +        *got_packet = process_output_surface(avctx, pkt, avctx->coded_frame, 
> tmpoutsurf);
> +
> +        tmpoutsurf->busy = 0;
> +        av_assert0(tmpoutsurf->inputSurface->lockCount);
> +        tmpoutsurf->inputSurface->lockCount--;
> +    }
> +
> +    return 0;
> +}
> +
> +static int pix_fmts_nvenc_initialized;
> +
> +static enum AVPixelFormat pix_fmts_nvenc[] = {
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_NONE,
> +    AV_PIX_FMT_NONE,
> +    AV_PIX_FMT_NONE
> +};
> +
> +static av_cold void nvenc_init_static(AVCodec *codec)
> +{
> +    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS stEncodeSessionParams = { 0 };
> +    CUcontext cuctxcur = 0, cuctx = 0;
> +    NVENCSTATUS nvStatus;
> +    void *nvencoder = 0;
> +    GUID encodeGuid = NV_ENC_CODEC_H264_GUID;
> +    GUID license = dummy_license;
> +    int i = 0, pos = 0;
> +    int gotnv12 = 0, got420 = 0, got444 = 0;
> +    uint32_t inputFmtCount = 32;
> +    NV_ENC_BUFFER_FORMAT inputFmts[32];
> +
> +    for (i = 0; i < 32; ++i)
> +        inputFmts[i] = (NV_ENC_BUFFER_FORMAT)0;
> +    i = 0;
> +
> +    if (pix_fmts_nvenc_initialized) {
> +        codec->pix_fmts = pix_fmts_nvenc;
> +        return;
> +    }
> +
> +    if (!ff_nvenc_dyload_nvenc(0)) {
> +        pix_fmts_nvenc_initialized = 1;
> +        return;
> +    }
> +
> +    stEncodeSessionParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
> +    stEncodeSessionParams.apiVersion = NVENCAPI_VERSION;
> +    stEncodeSessionParams.clientKeyPtr = &license;
> +
> +    cuctx = 0;

> +    if (ff_cuCtxCreate(&cuctx, 0, ff_pNvencDevices[ff_iNvencUseDeviceID]) != 
> CUDA_SUCCESS) {

It would probably be better to have ff_cuCtxCreate() return an AVERROR code
instead of a CUDA error code. Same for all ff_ helper functions.

> +        cuctx = 0;
> +        goto error;
> +    }
> +
> +    if (ff_cuCtxPopCurrent(&cuctxcur) != CUDA_SUCCESS)
> +        goto error;
> +
> +    stEncodeSessionParams.device = (void*)cuctx;
> +    stEncodeSessionParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
> +
> +    nvStatus = ff_pNvEnc->nvEncOpenEncodeSessionEx(&stEncodeSessionParams, 
> &nvencoder);
> +    if (nvStatus != NV_ENC_SUCCESS) {
> +        nvencoder = 0;
> +        goto error;
> +    }
> +
> +    nvStatus = ff_pNvEnc->nvEncGetInputFormats(nvencoder, encodeGuid, 
> inputFmts, 32, &inputFmtCount);
> +    if (nvStatus != NV_ENC_SUCCESS)
> +        goto error;
> +
> +    pos = 0;
> +    for (i = 0; i < inputFmtCount && pos < 3; ++i) {
> +        if (!gotnv12 && (inputFmts[i] == NV_ENC_BUFFER_FORMAT_NV12_PL
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_NV12_TILED16x16
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_NV12_TILED64x16)) {
> +
> +            pix_fmts_nvenc[pos++] = AV_PIX_FMT_NV12;
> +            gotnv12 = 1;
> +        } else if (!got420 && (inputFmts[i] == NV_ENC_BUFFER_FORMAT_YV12_PL
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_YV12_TILED16x16
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_YV12_TILED64x16)) {
> +
> +            pix_fmts_nvenc[pos++] = AV_PIX_FMT_YUV420P;
> +            got420 = 1;
> +        } else if (!got444 && (inputFmts[i] == NV_ENC_BUFFER_FORMAT_YUV444_PL
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_YUV444_TILED16x16
> +                || inputFmts[i] == NV_ENC_BUFFER_FORMAT_YUV444_TILED64x16)) {
> +
> +            pix_fmts_nvenc[pos++] = AV_PIX_FMT_YUV444P;
> +            got444 = 1;
> +        }
> +    }
> +
> +    pix_fmts_nvenc[pos] = AV_PIX_FMT_NONE;
> +
> +    pix_fmts_nvenc_initialized = 1;
> +    codec->pix_fmts = pix_fmts_nvenc;
> +
> +    ff_pNvEnc->nvEncDestroyEncoder(nvencoder);
> +    ff_cuCtxDestroy(cuctx);
> +
> +    ff_nvenc_unload_nvenc(0);
> +
> +    return;
> +
> +error:
> +
> +    if (nvencoder)
> +        ff_pNvEnc->nvEncDestroyEncoder(nvencoder);
> +
> +    if (cuctx)
> +        ff_cuCtxDestroy(cuctx);
> +
> +    pix_fmts_nvenc_initialized = 1;
> +    pix_fmts_nvenc[0] = AV_PIX_FMT_NV12;
> +    pix_fmts_nvenc[1] = AV_PIX_FMT_NONE;
> +
> +    codec->pix_fmts = pix_fmts_nvenc;
> +
> +    ff_nvenc_unload_nvenc(0);
> +}
> +
> +#define OFFSET(x) offsetof(NvencContext, x)
> +#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
> +static const AVOption options[] = {
> +    { "profile", "Set profile restrictions", OFFSET(profile), 
> AV_OPT_TYPE_STRING, { .str = "high" }, 0, 0, VE},
> +    { "preset", "Set the encoding preset (one of hq, hp, bd, ll, llhq, llhp, 
> default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "hq" }, 0, 0, VE },
> +    { "cqp", "Constant quantization parameter rate control method", 
> OFFSET(cqp), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
> +    { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_INT, { .i64 = 
> 0 }, 0, 1, VE },
> +    { "2pass", "Use 2pass cbr encoding mode (low latency mode only)", 
> OFFSET(twopass), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE },

Some of these options are redundant with global ones: "profile" (already
mentioned), "2pass" = -flags +pass1/+pass2, and "cqp" = "global_quality".

> +    { "goppattern", "Specifies the GOP pattern as follows: 0: I, 1: IPP, 2: 
> IBP, 3: IBBP", OFFSET(gobpattern), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 3, VE 
> },
> +    { NULL }
> +};
> +
> +static const AVClass nvenc_class = {
> +    .class_name = "nvenc",
> +    .item_name = av_default_item_name,
> +    .option = options,
> +    .version = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVCodecDefault nvenc_defaults[] = {
> +    { "b", "0" },
> +    { "qmin", "-1" },
> +    { "qmax", "-1" },
> +    { "qdiff", "-1" },
> +    { "qblur", "-1" },
> +    { "qcomp", "-1" },
> +    { NULL },
> +};
> +
> +AVCodec ff_nvenc_encoder = {
> +    .name = "nvenc",
> +    .long_name = NULL_IF_CONFIG_SMALL("Nvidia NVENC h264 encoder"),
> +    .type = AVMEDIA_TYPE_VIDEO,
> +    .id = AV_CODEC_ID_H264,
> +    .priv_data_size = sizeof(NvencContext),
> +    .init = nvenc_encode_init,
> +    .encode2 = nvenc_encode_frame,
> +    .close = nvenc_encode_close,
> +    .capabilities = CODEC_CAP_DELAY,
> +    .priv_class = &nvenc_class,
> +    .defaults = nvenc_defaults,
> +    .init_static_data = nvenc_init_static
> +};
> diff --git a/libavcodec/nvenc_api.c b/libavcodec/nvenc_api.c
> new file mode 100644
> index 0000000..53d5fa8
> --- /dev/null
> +++ b/libavcodec/nvenc_api.c
> @@ -0,0 +1,275 @@
> +/*
> + * H.264 hardware encoding using nvidia nvenc
> + * Copyright (c) 2014 Timo Rothenpieler <t...@rothenpieler.org>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#ifdef _WIN32
> +#include <windows.h>
> +#else
> +#include <dlfcn.h>
> +#endif
> +
> +#include "libavutil/avassert.h"
> +#include "avcodec.h"
> +#include "internal.h"
> +
> +#include "nvenc_cuda.h"
> +#include "nvenc_api.h"
> +
> +PCUINIT ff_cuInit = 0;
> +PCUDEVICEGETCOUNT ff_cuDeviceGetCount = 0;
> +PCUDEVICEGET ff_cuDeviceGet = 0;
> +PCUDEVICEGETNAME ff_cuDeviceGetName = 0;
> +PCUDEVICECOMPUTECAPABILITY ff_cuDeviceComputeCapability = 0;
> +PCUCTXCREATE ff_cuCtxCreate = 0;
> +PCUCTXPOPCURRENT ff_cuCtxPopCurrent = 0;
> +PCUCTXDESTROY ff_cuCtxDestroy = 0;
> +
> +static int nvenc_init_count;
> +static NV_ENCODE_API_FUNCTION_LIST nvEncFuncs;
> +NV_ENCODE_API_FUNCTION_LIST *ff_pNvEnc = 0;
> +int ff_iNvencDeviceCount = 0;
> +CUdevice ff_pNvencDevices[16];
> +unsigned int ff_iNvencUseDeviceID = 0;
> +
> +#ifdef _WIN32
> +#define LOAD_FUNC(l, s) GetProcAddress(l, s)
> +#define DL_CLOSE_FUNC(l) FreeLibrary(l)
> +static HMODULE cudaLib;
> +static HMODULE nvEncLib;
> +#else
> +#define LOAD_FUNC(l, s) dlsym(l, s)
> +#define DL_CLOSE_FUNC(l) dlclose(l)
> +static void *cudaLib;
> +static void *nvEncLib;
> +#endif
> +

> +#define ifav_log(...) if (avctx) { av_log(__VA_ARGS__); }

Looks strange: why no error message when there is no context?

> +
> +#define CHECK_LOAD_FUNC(t, f, s) \
> +{ \
> +    f = (t)LOAD_FUNC(cudaLib, s); \
> +    if (!f) { \
> +        ifav_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA 
> library\n", s); \
> +        goto error; \
> +    } \

> +}

Some compilers choke on that because of the semicolon after the block; for
that reason, it is recommended to use do { ... } while (0).

> +
> +static int nvenc_dyload_cuda(AVCodecContext *avctx)
> +{

> +    if (cudaLib)
> +        return 1;

Thread safe?

> +
> +#if defined(_WIN32)
> +    cudaLib = LoadLibrary(TEXT("nvcuda.dll"));
> +#elif defined(__CYGWIN__)
> +    cudaLib = dlopen("nvcuda.dll", RTLD_LAZY);
> +#else
> +    cudaLib = dlopen("libcuda.so", RTLD_LAZY);
> +#endif
> +
> +    if (!cudaLib) {
> +        ifav_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
> +        goto error;
> +    }
> +

> +    CHECK_LOAD_FUNC(PCUINIT, ff_cuInit, "cuInit");
> +    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, ff_cuDeviceGetCount, 
> "cuDeviceGetCount");
> +    CHECK_LOAD_FUNC(PCUDEVICEGET, ff_cuDeviceGet, "cuDeviceGet");
> +    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, ff_cuDeviceGetName, "cuDeviceGetName");
> +    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, 
> ff_cuDeviceComputeCapability, "cuDeviceComputeCapability");
> +    CHECK_LOAD_FUNC(PCUCTXCREATE, ff_cuCtxCreate, "cuCtxCreate_v2");
> +    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, ff_cuCtxPopCurrent, 
> "cuCtxPopCurrent_v2");
> +    CHECK_LOAD_FUNC(PCUCTXDESTROY, ff_cuCtxDestroy, "cuCtxDestroy_v2");

You could almost use #name and ff_##name to avoid duplicating the parameter.

> +
> +    return 1;
> +
> +error:
> +
> +    if (cudaLib)
> +        DL_CLOSE_FUNC(cudaLib);
> +
> +    cudaLib = 0;
> +
> +    return 0;
> +}
> +
> +static int checkCudaErrors(AVCodecContext *avctx, CUresult err, const char 
> *func)
> +{
> +    if (err != CUDA_SUCCESS) {

> +        ifav_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 
> 0x%x\n", func, err);

Does the library not provide an error code -> string utility?

> +        return 0;
> +    }
> +    return 1;
> +}
> +#define checkCudaErrors(f) if (!checkCudaErrors(avctx, f, #f)) goto error
> +
> +static int nvenc_check_cuda(AVCodecContext *avctx)
> +{
> +    int deviceCount = 0;
> +    CUdevice cuDevice = 0;
> +    char gpu_name[128];
> +    int SMminor = 0, SMmajor = 0;
> +    int i, smver;
> +
> +    if (!nvenc_dyload_cuda(avctx))
> +        return 0;
> +
> +    if (ff_iNvencDeviceCount > 0)
> +        return 1;
> +
> +    checkCudaErrors(ff_cuInit(0));
> +
> +    checkCudaErrors(ff_cuDeviceGetCount(&deviceCount));
> +
> +    if (!deviceCount) {
> +        ifav_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
> +        goto error;
> +    }
> +
> +    ifav_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", 
> deviceCount);
> +
> +    ff_iNvencDeviceCount = 0;
> +
> +    for (i = 0; i < deviceCount; ++i) {
> +        checkCudaErrors(ff_cuDeviceGet(&cuDevice, i));

> +        checkCudaErrors(ff_cuDeviceGetName(gpu_name, 128, cuDevice));

sizeof(gpu_name), to avoid desync errors.

> +        checkCudaErrors(ff_cuDeviceComputeCapability(&SMmajor, &SMminor, 
> cuDevice));
> +
> +        smver = (SMmajor << 4) | SMminor;
> +
> +        ifav_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM 
> %d.%d, NVENC %s ]\n", i, gpu_name, SMmajor, SMminor, (smver >= 0x30) ? 
> "Available" : "Not Available");
> +
> +        if (smver >= 0x30)
> +            ff_pNvencDevices[ff_iNvencDeviceCount++] = cuDevice;
> +    }
> +
> +    if (!ff_iNvencDeviceCount) {
> +        ifav_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
> +        goto error;
> +    }
> +
> +    return 1;
> +
> +error:
> +
> +    ff_iNvencDeviceCount = 0;
> +
> +    return 0;
> +}
> +
> +av_cold int ff_nvenc_dyload_nvenc(AVCodecContext *avctx)
> +{
> +    PNVENCODEAPICREATEINSTANCE nvEncodeAPICreateInstance = 0;
> +    NVENCSTATUS nvstatus;
> +
> +    if (!nvenc_check_cuda(avctx))
> +        return 0;
> +
> +    if (ff_pNvEnc) {
> +        nvenc_init_count++;
> +        return 1;
> +    }
> +
> +#if defined(_WIN32)
> +    if (sizeof(void*) == 8) {
> +        nvEncLib = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
> +    } else {
> +        nvEncLib = LoadLibrary(TEXT("nvEncodeAPI.dll"));
> +    }
> +#elif defined(__CYGWIN__)
> +    if (sizeof(void*) == 8) {
> +        nvEncLib = dlopen("nvEncodeAPI64.dll", RTLD_LAZY);
> +    } else {
> +        nvEncLib = dlopen("nvEncodeAPI.dll", RTLD_LAZY);
> +    }
> +#else
> +    nvEncLib = dlopen("libnvidia-encode.so", RTLD_LAZY);
> +#endif
> +
> +    if (!nvEncLib) {
> +        ifav_log(avctx, AV_LOG_FATAL, "Failed loading the nvenc library\n");
> +        goto error;
> +    }
> +
> +    nvEncodeAPICreateInstance = 
> (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(nvEncLib, "NvEncodeAPICreateInstance");
> +
> +    if (!nvEncodeAPICreateInstance) {
> +        ifav_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
> +        goto error;
> +    }
> +
> +    ff_pNvEnc = &nvEncFuncs;
> +    memset(ff_pNvEnc, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST));
> +    ff_pNvEnc->version = NV_ENCODE_API_FUNCTION_LIST_VER;
> +
> +    nvstatus = nvEncodeAPICreateInstance(ff_pNvEnc);
> +
> +    if (nvstatus != NV_ENC_SUCCESS) {
> +        ifav_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
> +        goto error;
> +    }
> +
> +    ifav_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
> +
> +    nvenc_init_count = 1;
> +
> +    return 1;
> +
> +error:
> +    if (nvEncLib)
> +        DL_CLOSE_FUNC(nvEncLib);
> +
> +    nvEncLib = 0;
> +    ff_pNvEnc = 0;
> +    nvenc_init_count = 0;
> +
> +    return 0;
> +}
> +
> +av_cold void ff_nvenc_unload_nvenc(AVCodecContext *avctx)
> +{

> +    if (nvenc_init_count <= 0)
> +        return;
> +
> +    nvenc_init_count--;

This does not look thread safe.

> +
> +    if (nvenc_init_count > 0)
> +        return;
> +
> +    DL_CLOSE_FUNC(nvEncLib);
> +    nvEncLib = 0;
> +    ff_pNvEnc = 0;
> +
> +    ff_iNvencDeviceCount = 0;
> +
> +    DL_CLOSE_FUNC(cudaLib);
> +    cudaLib = 0;
> +
> +    ff_cuInit = 0;
> +    ff_cuDeviceGetCount = 0;
> +    ff_cuDeviceGet = 0;
> +    ff_cuDeviceGetName = 0;
> +    ff_cuDeviceComputeCapability = 0;
> +    ff_cuCtxCreate = 0;
> +    ff_cuCtxPopCurrent = 0;
> +    ff_cuCtxDestroy = 0;
> +
> +    ifav_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
> +}
> diff --git a/libavcodec/nvenc_api.h b/libavcodec/nvenc_api.h
> new file mode 100644
> index 0000000..16b1c72
> --- /dev/null
> +++ b/libavcodec/nvenc_api.h
> @@ -0,0 +1,35 @@
> +/*
> + * H.264 hardware encoding using nvidia nvenc
> + * Copyright (c) 2014 Timo Rothenpieler <t...@rothenpieler.org>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#ifndef AVCODEC_NVENC_API_H
> +#define AVCODEC_NVENC_API_H
> +
> +#include <nvEncodeAPI.h>
> +
> +
> +typedef NVENCSTATUS (NVENCAPI* 
> PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
> +
> +extern NV_ENCODE_API_FUNCTION_LIST *ff_pNvEnc;
> +
> +int ff_nvenc_dyload_nvenc(AVCodecContext *avctx);
> +void ff_nvenc_unload_nvenc(AVCodecContext *avctx);
> +
> +#endif
> diff --git a/libavcodec/nvenc_cuda.h b/libavcodec/nvenc_cuda.h
> new file mode 100644
> index 0000000..ae43a22
> --- /dev/null
> +++ b/libavcodec/nvenc_cuda.h
> @@ -0,0 +1,62 @@
> +/*
> + * H.264 hardware encoding using nvidia nvenc
> + * Copyright (c) 2014 Timo Rothenpieler <t...@rothenpieler.org>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#ifndef AVCODEC_NVENC_CUDA_H
> +#define AVCODEC_NVENC_CUDA_H
> +
> +typedef enum cudaError_enum {
> +    CUDA_SUCCESS = 0
> +} CUresult;
> +typedef int CUdevice;
> +typedef void* CUcontext;
> +
> +#ifdef _WIN32
> +#define CUDAAPI __stdcall
> +#else
> +#define CUDAAPI
> +#endif
> +
> +typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
> +typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
> +typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
> +typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice 
> dev);
> +typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int 
> *minor, CUdevice dev);
> +typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, 
> CUdevice dev);
> +typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
> +typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
> +
> +extern PCUINIT ff_cuInit;
> +extern PCUDEVICEGETCOUNT ff_cuDeviceGetCount;
> +extern PCUDEVICEGET ff_cuDeviceGet;
> +extern PCUDEVICEGETNAME ff_cuDeviceGetName;
> +extern PCUDEVICECOMPUTECAPABILITY ff_cuDeviceComputeCapability;
> +extern PCUCTXCREATE ff_cuCtxCreate;
> +extern PCUCTXPOPCURRENT ff_cuCtxPopCurrent;
> +extern PCUCTXDESTROY ff_cuCtxDestroy;
> +
> +int ff_nvenc_dyload_cuda(AVCodecContext *avctx);
> +int ff_nvenc_check_cuda(AVCodecContext *avctx);
> +
> +extern int ff_iNvencDeviceCount;
> +extern CUdevice ff_pNvencDevices[16];
> +extern unsigned int ff_iNvencUseDeviceID;
> +
> +#endif

Regards,

-- 
  Nicolas George

signature.asc
Description: Digital signature

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] Add NVENC encoder

Reply via email to