ffmpeg | branch: master | James Almer <jamr...@gmail.com> | Fri Nov 10 20:38:36 2017 -0300| [1178babacaaad4e65fcb28af447afd586429c51a] | committer: James Almer
Merge commit 'b90fdb2c7199cc8b0e8d994fafba1fb4dc181d88' * commit 'b90fdb2c7199cc8b0e8d994fafba1fb4dc181d88': hevcdec: add a CUVID hwaccel Adapted for ffmpeg by Timo Rothenpieler. Merged-by: James Almer <jamr...@gmail.com> > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1178babacaaad4e65fcb28af447afd586429c51a --- Changelog | 2 +- configure | 2 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/hevcdec.c | 7 ++ libavcodec/nvdec.c | 1 + libavcodec/nvdec_hevc.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 292 insertions(+), 1 deletion(-) diff --git a/Changelog b/Changelog index ba2951db25..cd4a841619 100644 --- a/Changelog +++ b/Changelog @@ -13,7 +13,7 @@ version <next>: - PCE support for extended channel layouts in the AAC encoder - native aptX encoder and decoder - Raw aptX muxer and demuxer -- NVIDIA NVDEC-accelerated H.264 hwaccel decoding +- NVIDIA NVDEC-accelerated H.264 and HEVC hwaccel decoding version 3.4: diff --git a/configure b/configure index 1b90d8e9a1..2cf18ecc12 100755 --- a/configure +++ b/configure @@ -2692,6 +2692,8 @@ hevc_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_HEVC" hevc_d3d11va2_hwaccel_select="hevc_decoder" hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" hevc_dxva2_hwaccel_select="hevc_decoder" +hevc_nvdec_hwaccel_deps="cuda nvdec" +hevc_nvdec_hwaccel_select="hevc_decoder" hevc_qsv_hwaccel_deps="libmfx" hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index db1f70784a..7ac4e13a06 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -846,6 +846,7 @@ OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o +OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index c58f99c176..c817003693 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -79,6 +79,7 @@ static void register_all(void) REGISTER_HWACCEL(HEVC_D3D11VA, hevc_d3d11va); REGISTER_HWACCEL(HEVC_D3D11VA2, hevc_d3d11va2); REGISTER_HWACCEL(HEVC_DXVA2, hevc_dxva2); + REGISTER_HWACCEL(HEVC_NVDEC, hevc_nvdec); REGISTER_HWACCEL(HEVC_MEDIACODEC, hevc_mediacodec); REGISTER_HWACCEL(HEVC_QSV, hevc_qsv); REGISTER_HWACCEL(HEVC_VAAPI, hevc_vaapi); diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index 403a8cf454..6dd6d0c53c 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -354,6 +354,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ CONFIG_HEVC_VAAPI_HWACCEL + \ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) @@ -375,6 +376,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #if CONFIG_HEVC_VDPAU_HWACCEL *fmt++ = AV_PIX_FMT_VDPAU; #endif +#if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; +#endif #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; #endif @@ -393,6 +397,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; #endif +#if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; +#endif break; } diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c index 9ca9faa378..ab66b91a92 100644 --- a/libavcodec/nvdec.c +++ b/libavcodec/nvdec.c @@ -53,6 +53,7 @@ static int map_avcodec_id(enum AVCodecID id) { switch (id) { case AV_CODEC_ID_H264: return cudaVideoCodec_H264; + case AV_CODEC_ID_HEVC: return cudaVideoCodec_HEVC; } return -1; } diff --git a/libavcodec/nvdec_hevc.c b/libavcodec/nvdec_hevc.c new file mode 100644 index 0000000000..3c40ab2bea --- /dev/null +++ b/libavcodec/nvdec_hevc.c @@ -0,0 +1,279 @@ +/* + * HEVC HW decode acceleration through NVDEC + * + * Copyright (c) 2017 Anton Khirnov + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "avcodec.h" +#include "nvdec.h" +#include "decode.h" +#include "internal.h" +#include "hevcdec.h" +#include "hevc_data.h" + +static void dpb_add(CUVIDHEVCPICPARAMS *pp, int idx, const HEVCFrame *src) +{ + FrameDecodeData *fdd = (FrameDecodeData*)src->frame->private_ref->data; + const NVDECFrame *cf = fdd->hwaccel_priv; + + pp->RefPicIdx[idx] = cf ? cf->idx : -1; + pp->PicOrderCntVal[idx] = src->poc; + pp->IsLongTerm[idx] = !!(src->flags & HEVC_FRAME_FLAG_LONG_REF); +} + +static void fill_scaling_lists(CUVIDHEVCPICPARAMS *ppc, const HEVCContext *s) +{ + const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? + &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; + int i, j, pos; + + for (i = 0; i < 6; i++) { + for (j = 0; j < 16; j++) { + pos = 4 * ff_hevc_diag_scan4x4_y[j] + ff_hevc_diag_scan4x4_x[j]; + ppc->ScalingList4x4[i][j] = sl->sl[0][i][pos]; + } + + for (j = 0; j < 64; j++) { + pos = 8 * ff_hevc_diag_scan8x8_y[j] + ff_hevc_diag_scan8x8_x[j]; + ppc->ScalingList8x8[i][j] = sl->sl[1][i][pos]; + ppc->ScalingList16x16[i][j] = sl->sl[2][i][pos]; + + if (i < 2) + ppc->ScalingList32x32[i][j] = sl->sl[3][i][pos]; + } + } + + memcpy(ppc->ScalingListDCCoeff16x16, sl->sl_dc[0], sizeof(ppc->ScalingListDCCoeff16x16)); + memcpy(ppc->ScalingListDCCoeff32x32, sl->sl_dc[1], sizeof(ppc->ScalingListDCCoeff32x32)); +} + +static int nvdec_hevc_start_frame(AVCodecContext *avctx, + const uint8_t *buffer, uint32_t size) +{ + const HEVCContext *s = avctx->priv_data; + const HEVCPPS *pps = s->ps.pps; + const HEVCSPS *sps = s->ps.sps; + + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + CUVIDPICPARAMS *pp = &ctx->pic_params; + CUVIDHEVCPICPARAMS *ppc = &pp->CodecSpecific.hevc; + FrameDecodeData *fdd; + NVDECFrame *cf; + + int i, j, dpb_size, ret; + + ret = ff_nvdec_start_frame(avctx, s->ref->frame); + if (ret < 0) + return ret; + + fdd = (FrameDecodeData*)s->ref->frame->private_ref->data; + cf = (NVDECFrame*)fdd->hwaccel_priv; + + *pp = (CUVIDPICPARAMS) { + .PicWidthInMbs = sps->width / 16, + .FrameHeightInMbs = sps->height / 16, + .CurrPicIdx = cf->idx, + .ref_pic_flag = 1, + .intra_pic_flag = 0, + + .CodecSpecific.hevc = { + .pic_width_in_luma_samples = sps->width, + .pic_height_in_luma_samples = sps->height, + .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, + .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, + .log2_min_transform_block_size_minus2 = sps->log2_min_tb_size - 2, + .log2_diff_max_min_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, + .pcm_enabled_flag = sps->pcm_enabled_flag, + .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm_enabled_flag ? sps->pcm.log2_min_pcm_cb_size - 3 : 0, + .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, + .pcm_sample_bit_depth_luma_minus1 = sps->pcm_enabled_flag ? sps->pcm.bit_depth - 1 : 0, + .pcm_sample_bit_depth_chroma_minus1 = sps->pcm_enabled_flag ? sps->pcm.bit_depth_chroma - 1 : 0, + .pcm_loop_filter_disabled_flag = sps->pcm.loop_filter_disable_flag, + .strong_intra_smoothing_enabled_flag = sps->sps_strong_intra_smoothing_enable_flag, + .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, + .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, + .amp_enabled_flag = sps->amp_enabled_flag, + .separate_colour_plane_flag = sps->separate_colour_plane_flag, + .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, + .num_short_term_ref_pic_sets = sps->nb_st_rps, + .long_term_ref_pics_present_flag = sps->long_term_ref_pics_present_flag, + .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, + .sps_temporal_mvp_enabled_flag = sps->sps_temporal_mvp_enabled_flag, + .sample_adaptive_offset_enabled_flag = sps->sao_enabled, + .scaling_list_enable_flag = sps->scaling_list_enable_flag, + .IrapPicFlag = IS_IRAP(s), + .IdrPicFlag = IS_IDR(s), + .bit_depth_luma_minus8 = sps->bit_depth - 8, + .bit_depth_chroma_minus8 = sps->bit_depth - 8, + + .dependent_slice_segments_enabled_flag = pps->dependent_slice_segments_enabled_flag, + .slice_segment_header_extension_present_flag = pps->slice_header_extension_present_flag, + .sign_data_hiding_enabled_flag = pps->sign_data_hiding_flag, + .cu_qp_delta_enabled_flag = pps->cu_qp_delta_enabled_flag, + .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, + .init_qp_minus26 = pps->pic_init_qp_minus26, + .pps_cb_qp_offset = pps->cb_qp_offset, + .pps_cr_qp_offset = pps->cr_qp_offset, + .constrained_intra_pred_flag = pps->constrained_intra_pred_flag, + .weighted_pred_flag = pps->weighted_pred_flag, + .weighted_bipred_flag = pps->weighted_bipred_flag, + .transform_skip_enabled_flag = pps->transform_skip_enabled_flag, + .transquant_bypass_enabled_flag = pps->transquant_bypass_enable_flag, + .entropy_coding_sync_enabled_flag = pps->entropy_coding_sync_enabled_flag, + .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, + .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, + .loop_filter_across_tiles_enabled_flag = pps->loop_filter_across_tiles_enabled_flag, + .loop_filter_across_slices_enabled_flag = pps->seq_loop_filter_across_slices_enabled_flag, + .output_flag_present_flag = pps->output_flag_present_flag, + .num_ref_idx_l0_default_active_minus1 = pps->num_ref_idx_l0_default_active - 1, + .num_ref_idx_l1_default_active_minus1 = pps->num_ref_idx_l1_default_active - 1, + .lists_modification_present_flag = pps->lists_modification_present_flag, + .cabac_init_present_flag = pps->cabac_init_present_flag, + .pps_slice_chroma_qp_offsets_present_flag = pps->pic_slice_level_chroma_qp_offsets_present_flag, + .deblocking_filter_override_enabled_flag = pps->deblocking_filter_override_enabled_flag, + .pps_deblocking_filter_disabled_flag = pps->disable_dbf, + .pps_beta_offset_div2 = pps->beta_offset / 2, + .pps_tc_offset_div2 = pps->tc_offset / 2, + .tiles_enabled_flag = pps->tiles_enabled_flag, + .uniform_spacing_flag = pps->uniform_spacing_flag, + .num_tile_columns_minus1 = pps->num_tile_columns - 1, + .num_tile_rows_minus1 = pps->num_tile_rows - 1, + + .NumBitsForShortTermRPSInSlice = s->sh.short_term_rps ? s->sh.short_term_ref_pic_set_size : 0, + .NumDeltaPocsOfRefRpsIdx = s->sh.short_term_rps ? s->sh.short_term_rps->rps_idx_num_delta_pocs : 0, + .NumPocTotalCurr = s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + + s->rps[LT_CURR].nb_refs, + .NumPocStCurrBefore = s->rps[ST_CURR_BEF].nb_refs, + .NumPocStCurrAfter = s->rps[ST_CURR_AFT].nb_refs, + .NumPocLtCurr = s->rps[LT_CURR].nb_refs, + .CurrPicOrderCntVal = s->ref->poc, + }, + }; + + if (pps->num_tile_columns > FF_ARRAY_ELEMS(ppc->column_width_minus1) || + pps->num_tile_rows > FF_ARRAY_ELEMS(ppc->row_height_minus1)) { + av_log(avctx, AV_LOG_ERROR, "Too many tiles\n"); + return AVERROR(ENOSYS); + } + for (i = 0; i < pps->num_tile_columns; i++) + ppc->column_width_minus1[i] = pps->column_width[i] - 1; + for (i = 0; i < pps->num_tile_rows; i++) + ppc->row_height_minus1[i] = pps->row_height[i] - 1; + + if (s->rps[LT_CURR].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetLtCurr) || + s->rps[ST_CURR_BEF].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrBefore) || + s->rps[ST_CURR_AFT].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrAfter)) { + av_log(avctx, AV_LOG_ERROR, "Too many reference frames\n"); + return AVERROR(ENOSYS); + } + + dpb_size = 0; + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + const HEVCFrame *ref = &s->DPB[i]; + if (!(ref->flags & (HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF))) + continue; + if (dpb_size >= FF_ARRAY_ELEMS(ppc->RefPicIdx)) { + av_log(avctx, AV_LOG_ERROR, "Too many reference frames\n"); + return AVERROR_INVALIDDATA; + } + dpb_add(ppc, dpb_size++, ref); + + } + for (i = dpb_size; i < FF_ARRAY_ELEMS(ppc->RefPicIdx); i++) + ppc->RefPicIdx[i] = -1; + + for (i = 0; i < s->rps[ST_CURR_BEF].nb_refs; i++) { + for (j = 0; j < dpb_size; j++) { + if (ppc->PicOrderCntVal[j] == s->rps[ST_CURR_BEF].list[i]) { + ppc->RefPicSetStCurrBefore[i] = j; + break; + } + } + } + for (i = 0; i < s->rps[ST_CURR_AFT].nb_refs; i++) { + for (j = 0; j < dpb_size; j++) { + if (ppc->PicOrderCntVal[j] == s->rps[ST_CURR_AFT].list[i]) { + ppc->RefPicSetStCurrAfter[i] = j; + break; + } + } + } + for (i = 0; i < s->rps[LT_CURR].nb_refs; i++) { + for (j = 0; j < dpb_size; j++) { + if (ppc->PicOrderCntVal[j] == s->rps[LT_CURR].list[i]) { + ppc->RefPicSetLtCurr[i] = j; + break; + } + } + } + + fill_scaling_lists(ppc, s); + + return 0; +} + +static int nvdec_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, + uint32_t size) +{ + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + void *tmp; + + tmp = av_fast_realloc(ctx->bitstream, &ctx->bitstream_allocated, + ctx->bitstream_len + size + 3); + if (!tmp) + return AVERROR(ENOMEM); + ctx->bitstream = tmp; + + tmp = av_fast_realloc(ctx->slice_offsets, &ctx->slice_offsets_allocated, + (ctx->nb_slices + 1) * sizeof(*ctx->slice_offsets)); + if (!tmp) + return AVERROR(ENOMEM); + ctx->slice_offsets = tmp; + + AV_WB24(ctx->bitstream + ctx->bitstream_len, 1); + memcpy(ctx->bitstream + ctx->bitstream_len + 3, buffer, size); + ctx->slice_offsets[ctx->nb_slices] = ctx->bitstream_len ; + ctx->bitstream_len += size + 3; + ctx->nb_slices++; + + return 0; +} + +static int nvdec_hevc_decode_init(AVCodecContext *avctx) +{ + const HEVCContext *s = avctx->priv_data; + const HEVCSPS *sps = s->ps.sps; + return ff_nvdec_decode_init(avctx, sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + 1); +} + +AVHWAccel ff_hevc_nvdec_hwaccel = { + .name = "hevc_nvdec", + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .pix_fmt = AV_PIX_FMT_CUDA, + .start_frame = nvdec_hevc_start_frame, + .end_frame = ff_nvdec_end_frame, + .decode_slice = nvdec_hevc_decode_slice, + .init = nvdec_hevc_decode_init, + .uninit = ff_nvdec_decode_uninit, + .priv_data_size = sizeof(NVDECContext), +}; ====================================================================== diff --cc Changelog index ba2951db25,92a72490a9..cd4a841619 --- a/Changelog +++ b/Changelog @@@ -2,362 -2,37 +2,362 @@@ Entries are sorted chronologically fro releases are sorted from youngest to oldest. version <next>: -- Support for spherical videos -- Intel QSV-accelerated VP8 and VC-1 decoding -- VAAPI-accelerated VP8 and HEVC decoding -- VAAPI-accelerated deinterlacing -- config.log and other configuration files moved into avbuild/ directory -- VAAPI-accelerated MPEG-2 and VP8 encoding -- Apple Pixlet decoder +- Bitstream filters for editing metadata in H.264, HEVC and MPEG-2 streams +- Dropped support for OpenJPEG versions 2.0 and below. Using OpenJPEG now + requires 2.1 (or later) and pkg-config. +- VDA dropped (use VideoToolbox instead) +- MagicYUV encoder +- Raw AMR-NB and AMR-WB demuxers +- TiVo ty/ty+ demuxer +- Intel QSV-accelerated MJPEG encoding +- PCE support for extended channel layouts in the AAC encoder +- native aptX encoder and decoder +- Raw aptX muxer and demuxer - - NVIDIA NVDEC-accelerated H.264 hwaccel decoding ++- NVIDIA NVDEC-accelerated H.264 and HEVC hwaccel decoding + + +version 3.4: +- deflicker video filter +- doubleweave video filter +- lumakey video filter +- pixscope video filter +- oscilloscope video filter +- config.log and other configuration files moved into ffbuild/ directory +- update cuvid/nvenc headers to Video Codec SDK 8.0.14 +- afir audio filter +- scale_cuda CUDA based video scale filter +- librsvg support for svg rasterization +- crossfeed audio filter +- spec compliant VP9 muxing support in MP4 +- remove the libnut muxer/demuxer wrappers +- remove the libschroedinger encoder/decoder wrappers +- surround audio filter +- sofalizer filter switched to libmysofa +- Gremlin Digital Video demuxer and decoder +- headphone audio filter +- superequalizer audio filter +- roberts video filter - The x86 assembler default switched from yasm to nasm, pass --x86asmexe=yasm to configure to restore the old behavior. -- Cineform HD decoder -- VP9 superframe split/merge bitstream filters +- additional frame format support for Interplay MVE movies +- support for decoding through D3D11VA in ffmpeg +- limiter video filter +- libvmaf video filter +- Dolby E decoder and SMPTE 337M demuxer +- unpremultiply video filter +- tlut2 video filter +- floodfill video filter +- pseudocolor video filter +- raw G.726 muxer and demuxer, left- and right-justified +- NewTek NDI input/output device +- Some video filters with several inputs now use a common set of options: + blend, libvmaf, lut3d, overlay, psnr, ssim. + They must always be used by name. +- FITS demuxer and decoder +- FITS muxer and encoder +- add --disable-autodetect build switch +- drop deprecated qtkit input device (use avfoundation instead) +- despill video filter +- haas audio filter +- SUP/PGS subtitle muxer +- convolve video filter +- VP9 tile threading support +- KMS screen grabber +- CUDA thumbnail filter +- V4L2 mem2mem HW assisted codecs +- Rockchip MPP hardware decoding +- vmafmotion video filter +- use MIME type "G726" for little-endian G.726, "AAL2-G726" for big-endian G.726 + + +version 3.3: +- CrystalHD decoder moved to new decode API +- add internal ebur128 library, remove external libebur128 dependency +- Pro-MPEG CoP #3-R2 FEC protocol +- premultiply video filter +- Support for spherical videos +- configure now fails if autodetect-libraries are requested but not found +- PSD Decoder +- 16.8 floating point pcm decoder +- 24.0 floating point pcm decoder +- Apple Pixlet decoder +- QDMC audio decoder +- NewTek SpeedHQ decoder +- MIDI Sample Dump Standard demuxer +- readeia608 filter +- Sample Dump eXchange demuxer +- abitscope multimedia filter +- Scenarist Closed Captions demuxer and muxer +- threshold filter +- midequalizer filter +- Optimal Huffman tables for (M)JPEG encoding +- VAAPI-accelerated MPEG-2 and VP8 encoding - FM Screen Capture Codec decoder -- ClearVideo decoder (I-frames only) -- support for decoding through D3D11VA in avconv -- Cinepak encoder -- Intel QSV-accelerated MJPEG encoding -- NVIDIA CUVID-accelerated H.264 and HEVC decoding +- native Opus encoder +- ScreenPressor decoder +- incomplete ClearVideo decoder +- Intel QSV video scaling and deinterlacing filters +- Support MOV with multiple sample description tables +- XPM decoder +- Removed the legacy X11 screen grabber, use XCB instead +- MPEG-7 Video Signature filter +- Removed asyncts filter (use af_aresample instead) +- Intel QSV-accelerated VP8 video decoding +- VAAPI-accelerated deinterlacing -version 12: -- aliases and defaults for Ogg subtypes (opus, spx) -- HEVC/H.265 RTP payload format (draft v6) packetizer and depacketizer -- avplay now exits by default at the end of playback -- XCB-based screen-grabber -- creating DASH compatible fragmented MP4, MPEG-DASH segmenting muxer -- H.261 RTP payload format (RFC 4587) depacketizer and experimental packetizer +version 3.2: +- libopenmpt demuxer +- tee protocol +- Changed metadata print option to accept general urls +- Alias muxer for Ogg Video (.ogv) +- VP8 in Ogg muxing +- curves filter doesn't automatically insert points at x=0 and x=1 anymore +- 16-bit support in curves filter and selectivecolor filter +- OpenH264 decoder wrapper +- MediaCodec H.264/HEVC/MPEG-4/VP8/VP9 hwaccel +- True Audio (TTA) muxer +- crystalizer audio filter +- acrusher audio filter +- bitplanenoise video filter +- floating point support in als decoder +- fifo muxer +- maskedclamp filter +- hysteresis filter +- lut2 filter +- yuvtestsrc filter +- CUDA CUVID H.263/VP8/VP9/10 bit HEVC (Dithered) Decoding +- vaguedenoiser filter +- added threads option per filter instance +- weave filter +- gblur filter +- avgblur filter +- sobel and prewitt filter +- MediaCodec HEVC/MPEG-4/VP8/VP9 decoding +- Meridian Lossless Packing (MLP) / TrueHD encoder +- Non-Local Means (nlmeans) denoising filter +- sdl2 output device and ffplay support +- sdl1 output device and sdl1 support removed +- extended mov edit list support +- libfaac encoder removed +- Matroska muxer now writes CRC32 elements by default in all Level 1 elements +- sidedata video and asidedata audio filter +- Changed mapping of rtp MIME type G726 to codec g726le. +- spec compliant VAAPI/DXVA2 VC-1 decoding of slices in frame-coded images + + +version 3.1: +- DXVA2-accelerated HEVC Main10 decoding +- fieldhint filter +- loop video filter and aloop audio filter +- Bob Weaver deinterlacing filter +- firequalizer filter +- datascope filter +- bench and abench filters +- ciescope filter +- protocol blacklisting API +- MediaCodec H264 decoding +- VC-2 HQ RTP payload format (draft v1) depacketizer and packetizer +- VP9 RTP payload format (draft v2) packetizer +- AudioToolbox audio decoders +- AudioToolbox audio encoders +- coreimage filter (GPU based image filtering on OSX) +- libdcadec removed +- bitstream filter for extracting DTS core +- ADPCM IMA DAT4 decoder +- musx demuxer +- aix demuxer +- remap filter +- hash and framehash muxers +- colorspace filter +- hdcd filter +- readvitc filter +- VAAPI-accelerated format conversion and scaling +- libnpp/CUDA-accelerated format conversion and scaling +- Duck TrueMotion 2.0 Real Time decoder +- Wideband Single-bit Data (WSD) demuxer +- VAAPI-accelerated H.264/HEVC/MJPEG encoding +- DTS Express (LBR) decoder +- Generic OpenMAX IL encoder with support for Raspberry Pi +- IFF ANIM demuxer & decoder +- Direct Stream Transfer (DST) decoder +- loudnorm filter +- MTAF demuxer and decoder +- MagicYUV decoder +- OpenExr improvements (tile data and B44/B44A support) +- BitJazz SheerVideo decoder +- CUDA CUVID H264/HEVC decoder +- 10-bit depth support in native utvideo decoder +- libutvideo wrapper removed +- YUY2 Lossless Codec decoder +- VideoToolbox H.264 encoder + + +version 3.0: +- Common Encryption (CENC) MP4 encoding and decoding support +- DXV decoding +- extrastereo filter +- ocr filter +- alimiter filter +- stereowiden filter +- stereotools filter +- rubberband filter +- tremolo filter +- agate filter +- chromakey filter +- maskedmerge filter +- Screenpresso SPV1 decoding +- chromaprint fingerprinting muxer +- ffplay dynamic volume control +- displace filter +- selectivecolor filter +- extensive native AAC encoder improvements and removal of experimental flag +- ADPCM PSX decoder +- 3dostr, dcstr, fsb, genh, vag, xvag, ads, msf, svag & vpk demuxer +- zscale filter +- wve demuxer +- zero-copy Intel QSV transcoding in ffmpeg +- shuffleframes filter +- SDX2 DPCM decoder +- vibrato filter +- innoHeim/Rsupport Screen Capture Codec decoder +- ADPCM AICA decoder +- Interplay ACM demuxer and audio decoder +- XMA1 & XMA2 decoder +- realtime filter +- anoisesrc audio filter source +- IVR demuxer +- compensationdelay filter +- acompressor filter +- support encoding 16-bit RLE SGI images +- apulsator filter +- sidechaingate audio filter +- mipsdspr1 option has been renamed to mipsdsp +- aemphasis filter +- mips32r5 option has been removed +- mips64r6 option has been removed +- DXVA2-accelerated VP9 decoding +- SOFAlizer: virtual binaural acoustics filter +- VAAPI VP9 hwaccel +- audio high-order multiband parametric equalizer +- automatic bitstream filtering +- showspectrumpic filter +- libstagefright support removed +- spectrumsynth filter +- ahistogram filter +- only seek with the right mouse button in ffplay +- toggle full screen when double-clicking with the left mouse button in ffplay +- afftfilt filter +- convolution filter +- libquvi support removed +- support for dvaudio in wav and avi +- libaacplus and libvo-aacenc support removed +- Cineform HD decoder +- new DCA decoder with full support for DTS-HD extensions +- significant performance improvements in Windows Television (WTV) demuxer +- nnedi deinterlacer +- streamselect video and astreamselect audio filter +- swaprect filter +- metadata video and ametadata audio filter +- SMPTE VC-2 HQ profile support for the Dirac decoder +- SMPTE VC-2 native encoder supporting the HQ profile + + +version 2.8: +- colorkey video filter +- BFSTM/BCSTM demuxer +- little-endian ADPCM_THP decoder +- Hap decoder and encoder +- DirectDraw Surface image/texture decoder +- ssim filter +- optional new ASF demuxer +- showvolume filter +- Many improvements to the JPEG 2000 decoder +- Go2Meeting decoding support +- adrawgraph audio and drawgraph video filter +- removegrain video filter +- Intel QSV-accelerated MPEG-2 video and HEVC encoding +- Intel QSV-accelerated MPEG-2 video and HEVC decoding +- Intel QSV-accelerated VC-1 video decoding +- libkvazaar HEVC encoder +- erosion, dilation, deflate and inflate video filters +- Dynamic Audio Normalizer as dynaudnorm filter +- Reverse video and areverse audio filter +- Random filter +- deband filter +- AAC fixed-point decoding +- sidechaincompress audio filter +- bitstream filter for converting HEVC from MP4 to Annex B +- acrossfade audio filter +- allyuv and allrgb video sources +- atadenoise video filter +- OS X VideoToolbox support +- aphasemeter filter +- showfreqs filter +- vectorscope filter +- waveform filter +- hstack and vstack filter +- Support DNx100 (1440x1080@8) +- VAAPI hevc hwaccel +- VDPAU hevc hwaccel +- framerate filter +- Switched default encoders for webm to VP9 and Opus +- Removed experimental flag from the JPEG 2000 encoder + + +version 2.7: +- FFT video filter +- TDSC decoder +- DTS lossless extension (XLL) decoding (not lossless, disabled by default) +- showwavespic filter +- DTS decoding through libdcadec +- Drop support for nvenc API before 5.0 +- nvenc HEVC encoder +- Detelecine filter +- Intel QSV-accelerated H.264 encoding +- MMAL-accelerated H.264 decoding +- basic APNG encoder and muxer with default extension "apng" +- unpack DivX-style packed B-frames in MPEG-4 bitstream filter +- WebM Live Chunk Muxer +- nvenc level and tier options +- chorus filter +- Canopus HQ/HQA decoder +- Automatically rotate videos based on metadata in ffmpeg +- improved Quickdraw compatibility +- VP9 high bit-depth and extended colorspaces decoding support +- WebPAnimEncoder API when available for encoding and muxing WebP +- Direct3D11-accelerated decoding +- Support Secure Transport +- Multipart JPEG demuxer + + +version 2.6: +- nvenc encoder +- 10bit spp filter +- colorlevels filter +- RIFX format for *.wav files - RTP/mpegts muxer -- VP8 in Ogg demuxing +- non continuous cache protocol support +- tblend filter +- cropdetect support for non 8bpp, absolute (if limit >= 1) and relative (if limit < 1.0) threshold +- Camellia symmetric block cipher - OpenH264 encoder wrapper +- VOC seeking support +- Closed caption Decoder +- fspp, uspp, pp7 MPlayer postprocessing filters ported to native filters +- showpalette filter +- Twofish symmetric block cipher - Support DNx100 (960x720@8) -- Direct3D11-accelerated decoding +- eq2 filter ported from libmpcodecs as eq filter +- removed libmpcodecs +- Changed default DNxHD colour range in QuickTime .mov derivatives to mpeg range +- ported softpulldown filter from libmpcodecs as repeatfields filter +- dcshift filter +- RTP depacketizer for loss tolerant payload format for MP3 audio (RFC 5219) +- RTP depacketizer for AC3 payload format (RFC 4184) +- palettegen and paletteuse filters +- VP9 RTP payload format (draft 0) experimental depacketizer +- RTP depacketizer for DV (RFC 6469) - DXVA2-accelerated HEVC decoding - AAC ELD 480 decoding - Intel QSV-accelerated H.264 decoding diff --cc configure index 1b90d8e9a1,4510100f38..2cf18ecc12 --- a/configure +++ b/configure @@@ -2692,6 -2218,6 +2692,8 @@@ hevc_d3d11va2_hwaccel_deps="d3d11va DXV hevc_d3d11va2_hwaccel_select="hevc_decoder" hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" hevc_dxva2_hwaccel_select="hevc_decoder" ++hevc_nvdec_hwaccel_deps="cuda nvdec" ++hevc_nvdec_hwaccel_select="hevc_decoder" hevc_qsv_hwaccel_deps="libmfx" hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" diff --cc libavcodec/Makefile index db1f70784a,bb568ddbe4..7ac4e13a06 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@@ -836,16 -634,16 +836,17 @@@ OBJS-$(CONFIG_VIDEOTOOLBOX OBJS-$(CONFIG_VDPAU) += vdpau.o OBJS-$(CONFIG_H263_VAAPI_HWACCEL) += vaapi_mpeg4.o -OBJS-$(CONFIG_H264_CUVID_HWACCEL) += cuvid_h264.o +OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o +OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o -OBJS-$(CONFIG_H264_VDA_HWACCEL) += vda_h264.o OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o -OBJS-$(CONFIG_HEVC_CUVID_HWACCEL) += cuvid_hevc.o +OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o ++OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o diff --cc libavcodec/allcodecs.c index c58f99c176,4ece4307a0..c817003693 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@@ -79,7 -82,6 +79,8 @@@ static void register_all(void REGISTER_HWACCEL(HEVC_D3D11VA, hevc_d3d11va); REGISTER_HWACCEL(HEVC_D3D11VA2, hevc_d3d11va2); REGISTER_HWACCEL(HEVC_DXVA2, hevc_dxva2); ++ REGISTER_HWACCEL(HEVC_NVDEC, hevc_nvdec); + REGISTER_HWACCEL(HEVC_MEDIACODEC, hevc_mediacodec); REGISTER_HWACCEL(HEVC_QSV, hevc_qsv); REGISTER_HWACCEL(HEVC_VAAPI, hevc_vaapi); REGISTER_HWACCEL(HEVC_VDPAU, hevc_vdpau); diff --cc libavcodec/hevcdec.c index 403a8cf454,a1619cf4bd..6dd6d0c53c --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@@ -352,34 -383,17 +352,38 @@@ static void export_stream_params(AVCode static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) { - #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ - CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL + \ - CONFIG_HEVC_CUVID_HWACCEL) +#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ ++ CONFIG_HEVC_NVDEC_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; - if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == AV_PIX_FMT_YUVJ420P || - sps->pix_fmt == AV_PIX_FMT_YUV420P10) { + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: +#if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; +#endif #if CONFIG_HEVC_D3D11VA_HWACCEL *fmt++ = AV_PIX_FMT_D3D11VA_VLD; *fmt++ = AV_PIX_FMT_D3D11; #endif +#if CONFIG_HEVC_VAAPI_HWACCEL + *fmt++ = AV_PIX_FMT_VAAPI; +#endif +#if CONFIG_HEVC_VDPAU_HWACCEL + *fmt++ = AV_PIX_FMT_VDPAU; +#endif ++#if CONFIG_HEVC_NVDEC_HWACCEL ++ *fmt++ = AV_PIX_FMT_CUDA; ++#endif +#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; +#endif + break; + case AV_PIX_FMT_YUV420P10: #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif @@@ -390,10 -400,17 +394,13 @@@ #if CONFIG_HEVC_VAAPI_HWACCEL *fmt++ = AV_PIX_FMT_VAAPI; #endif -#if CONFIG_HEVC_CUVID_HWACCEL && HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8 - *fmt++ = AV_PIX_FMT_CUDA; +#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; #endif - } - if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == AV_PIX_FMT_YUVJ420P) { -#if CONFIG_HEVC_CUVID_HWACCEL && !HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8 ++#if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; + #endif -#if CONFIG_HEVC_VDPAU_HWACCEL - *fmt++ = AV_PIX_FMT_VDPAU; -#endif + break; } *fmt++ = sps->pix_fmt; diff --cc libavcodec/nvdec.c index 9ca9faa378,0000000000..ab66b91a92 mode 100644,000000..100644 --- a/libavcodec/nvdec.c +++ b/libavcodec/nvdec.c @@@ -1,431 -1,0 +1,432 @@@ +/* + * HW decode acceleration through NVDEC + * + * Copyright (c) 2016 Anton Khirnov + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/common.h" +#include "libavutil/error.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/pixdesc.h" +#include "libavutil/pixfmt.h" + +#include "avcodec.h" +#include "decode.h" +#include "nvdec.h" +#include "internal.h" + +typedef struct NVDECDecoder { + CUvideodecoder decoder; + + AVBufferRef *hw_device_ref; + CUcontext cuda_ctx; + + CudaFunctions *cudl; + CuvidFunctions *cvdl; +} NVDECDecoder; + +typedef struct NVDECFramePool { + unsigned int dpb_size; + unsigned int nb_allocated; +} NVDECFramePool; + +static int map_avcodec_id(enum AVCodecID id) +{ + switch (id) { + case AV_CODEC_ID_H264: return cudaVideoCodec_H264; ++ case AV_CODEC_ID_HEVC: return cudaVideoCodec_HEVC; + } + return -1; +} + +static int map_chroma_format(enum AVPixelFormat pix_fmt) +{ + int shift_h = 0, shift_v = 0; + + av_pix_fmt_get_chroma_sub_sample(pix_fmt, &shift_h, &shift_v); + + if (shift_h == 1 && shift_v == 1) + return cudaVideoChromaFormat_420; + else if (shift_h == 1 && shift_v == 0) + return cudaVideoChromaFormat_422; + else if (shift_h == 0 && shift_v == 0) + return cudaVideoChromaFormat_444; + + return -1; +} + +static void nvdec_decoder_free(void *opaque, uint8_t *data) +{ + NVDECDecoder *decoder = (NVDECDecoder*)data; + + if (decoder->decoder) + decoder->cvdl->cuvidDestroyDecoder(decoder->decoder); + + av_buffer_unref(&decoder->hw_device_ref); + + cuvid_free_functions(&decoder->cvdl); + + av_freep(&decoder); +} + +static int nvdec_decoder_create(AVBufferRef **out, AVBufferRef *hw_device_ref, + CUVIDDECODECREATEINFO *params, void *logctx) +{ + AVHWDeviceContext *hw_device_ctx = (AVHWDeviceContext*)hw_device_ref->data; + AVCUDADeviceContext *device_hwctx = hw_device_ctx->hwctx; + + AVBufferRef *decoder_ref; + NVDECDecoder *decoder; + + CUcontext dummy; + CUresult err; + int ret; + + decoder = av_mallocz(sizeof(*decoder)); + if (!decoder) + return AVERROR(ENOMEM); + + decoder_ref = av_buffer_create((uint8_t*)decoder, sizeof(*decoder), + nvdec_decoder_free, NULL, AV_BUFFER_FLAG_READONLY); + if (!decoder_ref) { + av_freep(&decoder); + return AVERROR(ENOMEM); + } + + decoder->hw_device_ref = av_buffer_ref(hw_device_ref); + if (!decoder->hw_device_ref) { + ret = AVERROR(ENOMEM); + goto fail; + } + decoder->cuda_ctx = device_hwctx->cuda_ctx; + decoder->cudl = device_hwctx->internal->cuda_dl; + + ret = cuvid_load_functions(&decoder->cvdl); + if (ret < 0) { + av_log(logctx, AV_LOG_ERROR, "Failed loading nvcuvid.\n"); + goto fail; + } + + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); + if (err != CUDA_SUCCESS) { + ret = AVERROR_UNKNOWN; + goto fail; + } + + err = decoder->cvdl->cuvidCreateDecoder(&decoder->decoder, params); + + decoder->cudl->cuCtxPopCurrent(&dummy); + + if (err != CUDA_SUCCESS) { + av_log(logctx, AV_LOG_ERROR, "Error creating a NVDEC decoder: %d\n", err); + ret = AVERROR_UNKNOWN; + goto fail; + } + + *out = decoder_ref; + + return 0; +fail: + av_buffer_unref(&decoder_ref); + return ret; +} + +static AVBufferRef *nvdec_decoder_frame_alloc(void *opaque, int size) +{ + NVDECFramePool *pool = opaque; + AVBufferRef *ret; + + if (pool->nb_allocated >= pool->dpb_size) + return NULL; + + ret = av_buffer_alloc(sizeof(unsigned int)); + if (!ret) + return NULL; + + *(unsigned int*)ret->data = pool->nb_allocated++; + + return ret; +} + +int ff_nvdec_decode_uninit(AVCodecContext *avctx) +{ + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + + av_freep(&ctx->bitstream); + ctx->bitstream_len = 0; + ctx->bitstream_allocated = 0; + + av_freep(&ctx->slice_offsets); + ctx->nb_slices = 0; + ctx->slice_offsets_allocated = 0; + + av_buffer_unref(&ctx->decoder_ref); + av_buffer_pool_uninit(&ctx->decoder_pool); + + return 0; +} + +int ff_nvdec_decode_init(AVCodecContext *avctx, unsigned int dpb_size) +{ + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + + NVDECFramePool *pool; + AVHWFramesContext *frames_ctx; + const AVPixFmtDescriptor *sw_desc; + + CUVIDDECODECREATEINFO params = { 0 }; + + int cuvid_codec_type, cuvid_chroma_format; + int ret = 0; + + sw_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt); + if (!sw_desc) + return AVERROR_BUG; + + cuvid_codec_type = map_avcodec_id(avctx->codec_id); + if (cuvid_codec_type < 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported codec ID\n"); + return AVERROR_BUG; + } + + cuvid_chroma_format = map_chroma_format(avctx->sw_pix_fmt); + if (cuvid_chroma_format < 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported chroma format\n"); + return AVERROR(ENOSYS); + } + + if (avctx->thread_type & FF_THREAD_FRAME) + dpb_size += avctx->thread_count; + + if (!avctx->hw_frames_ctx) { + AVHWFramesContext *frames_ctx; + + if (!avctx->hw_device_ctx) { + av_log(avctx, AV_LOG_ERROR, "A hardware device or frames context " + "is required for CUVID decoding.\n"); + return AVERROR(EINVAL); + } + + avctx->hw_frames_ctx = av_hwframe_ctx_alloc(avctx->hw_device_ctx); + if (!avctx->hw_frames_ctx) + return AVERROR(ENOMEM); + frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data; + + frames_ctx->format = AV_PIX_FMT_CUDA; + frames_ctx->width = avctx->coded_width; + frames_ctx->height = avctx->coded_height; + frames_ctx->sw_format = AV_PIX_FMT_NV12; + frames_ctx->sw_format = sw_desc->comp[0].depth > 8 ? + AV_PIX_FMT_P010 : AV_PIX_FMT_NV12; + frames_ctx->initial_pool_size = dpb_size; + + ret = av_hwframe_ctx_init(avctx->hw_frames_ctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Error initializing internal frames context\n"); + return ret; + } + } + frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data; + + params.ulWidth = avctx->coded_width; + params.ulHeight = avctx->coded_height; + params.ulTargetWidth = avctx->coded_width; + params.ulTargetHeight = avctx->coded_height; + params.bitDepthMinus8 = sw_desc->comp[0].depth - 8; + params.OutputFormat = params.bitDepthMinus8 ? + cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + params.CodecType = cuvid_codec_type; + params.ChromaFormat = cuvid_chroma_format; + params.ulNumDecodeSurfaces = dpb_size; + params.ulNumOutputSurfaces = 1; + + ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, ¶ms, avctx); + if (ret < 0) + return ret; + + pool = av_mallocz(sizeof(*pool)); + if (!pool) { + ret = AVERROR(ENOMEM); + goto fail; + } + pool->dpb_size = dpb_size; + + ctx->decoder_pool = av_buffer_pool_init2(sizeof(int), pool, + nvdec_decoder_frame_alloc, av_free); + if (!ctx->decoder_pool) { + ret = AVERROR(ENOMEM); + goto fail; + } + + return 0; +fail: + ff_nvdec_decode_uninit(avctx); + return ret; +} + +static void nvdec_fdd_priv_free(void *priv) +{ + NVDECFrame *cf = priv; + + if (!cf) + return; + + av_buffer_unref(&cf->idx_ref); + av_buffer_unref(&cf->decoder_ref); + + av_freep(&priv); +} + +static int nvdec_retrieve_data(void *logctx, AVFrame *frame) +{ + FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; + NVDECFrame *cf = (NVDECFrame*)fdd->hwaccel_priv; + NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data; + + CUVIDPROCPARAMS vpp = { .progressive_frame = 1 }; + + CUresult err; + CUcontext dummy; + CUdeviceptr devptr; + + unsigned int pitch, i; + unsigned int offset = 0; + int ret = 0; + + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); + if (err != CUDA_SUCCESS) + return AVERROR_UNKNOWN; + + err = decoder->cvdl->cuvidMapVideoFrame(decoder->decoder, cf->idx, &devptr, + &pitch, &vpp); + if (err != CUDA_SUCCESS) { + av_log(logctx, AV_LOG_ERROR, "Error mapping a picture with CUVID: %d\n", + err); + ret = AVERROR_UNKNOWN; + goto finish; + } + + for (i = 0; frame->data[i]; i++) { + CUDA_MEMCPY2D cpy = { + .srcMemoryType = CU_MEMORYTYPE_DEVICE, + .dstMemoryType = CU_MEMORYTYPE_DEVICE, + .srcDevice = devptr, + .dstDevice = (CUdeviceptr)frame->data[i], + .srcPitch = pitch, + .dstPitch = frame->linesize[i], + .srcY = offset, + .WidthInBytes = FFMIN(pitch, frame->linesize[i]), + .Height = frame->height >> (i ? 1 : 0), + }; + + err = decoder->cudl->cuMemcpy2D(&cpy); + if (err != CUDA_SUCCESS) { + av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n", + err); + ret = AVERROR_UNKNOWN; + goto copy_fail; + } + + offset += cpy.Height; + } + +copy_fail: + decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); + +finish: + decoder->cudl->cuCtxPopCurrent(&dummy); + return ret; +} + +int ff_nvdec_start_frame(AVCodecContext *avctx, AVFrame *frame) +{ + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; + NVDECFrame *cf = NULL; + int ret; + + ctx->bitstream_len = 0; + ctx->nb_slices = 0; + + if (fdd->hwaccel_priv) + return 0; + + cf = av_mallocz(sizeof(*cf)); + if (!cf) + return AVERROR(ENOMEM); + + cf->decoder_ref = av_buffer_ref(ctx->decoder_ref); + if (!cf->decoder_ref) + goto fail; + + cf->idx_ref = av_buffer_pool_get(ctx->decoder_pool); + if (!cf->idx_ref) { + av_log(avctx, AV_LOG_ERROR, "No decoder surfaces left\n"); + ret = AVERROR(ENOMEM); + goto fail; + } + cf->idx = *(unsigned int*)cf->idx_ref->data; + + fdd->hwaccel_priv = cf; + fdd->hwaccel_priv_free = nvdec_fdd_priv_free; + fdd->post_process = nvdec_retrieve_data; + + return 0; +fail: + nvdec_fdd_priv_free(cf); + return ret; + +} + +int ff_nvdec_end_frame(AVCodecContext *avctx) +{ + NVDECContext *ctx = avctx->internal->hwaccel_priv_data; + NVDECDecoder *decoder = (NVDECDecoder*)ctx->decoder_ref->data; + CUVIDPICPARAMS *pp = &ctx->pic_params; + + CUresult err; + CUcontext dummy; + + int ret = 0; + + pp->nBitstreamDataLen = ctx->bitstream_len; + pp->pBitstreamData = ctx->bitstream; + pp->nNumSlices = ctx->nb_slices; + pp->pSliceDataOffsets = ctx->slice_offsets; + + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); + if (err != CUDA_SUCCESS) + return AVERROR_UNKNOWN; + + err = decoder->cvdl->cuvidDecodePicture(decoder->decoder, &ctx->pic_params); + if (err != CUDA_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Error decoding a picture with NVDEC: %d\n", + err); + ret = AVERROR_UNKNOWN; + goto finish; + } + +finish: + decoder->cudl->cuCtxPopCurrent(&dummy); + + return ret; +} diff --cc libavcodec/nvdec_hevc.c index 0000000000,0000000000..3c40ab2bea new file mode 100644 --- /dev/null +++ b/libavcodec/nvdec_hevc.c @@@ -1,0 -1,0 +1,279 @@@ ++/* ++ * HEVC HW decode acceleration through NVDEC ++ * ++ * Copyright (c) 2017 Anton Khirnov ++ * ++ * This file is part of Libav. ++ * ++ * Libav is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * Libav is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with Libav; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++#include <string.h> ++ ++#include "avcodec.h" ++#include "nvdec.h" ++#include "decode.h" ++#include "internal.h" ++#include "hevcdec.h" ++#include "hevc_data.h" ++ ++static void dpb_add(CUVIDHEVCPICPARAMS *pp, int idx, const HEVCFrame *src) ++{ ++ FrameDecodeData *fdd = (FrameDecodeData*)src->frame->private_ref->data; ++ const NVDECFrame *cf = fdd->hwaccel_priv; ++ ++ pp->RefPicIdx[idx] = cf ? cf->idx : -1; ++ pp->PicOrderCntVal[idx] = src->poc; ++ pp->IsLongTerm[idx] = !!(src->flags & HEVC_FRAME_FLAG_LONG_REF); ++} ++ ++static void fill_scaling_lists(CUVIDHEVCPICPARAMS *ppc, const HEVCContext *s) ++{ ++ const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? ++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ int i, j, pos; ++ ++ for (i = 0; i < 6; i++) { ++ for (j = 0; j < 16; j++) { ++ pos = 4 * ff_hevc_diag_scan4x4_y[j] + ff_hevc_diag_scan4x4_x[j]; ++ ppc->ScalingList4x4[i][j] = sl->sl[0][i][pos]; ++ } ++ ++ for (j = 0; j < 64; j++) { ++ pos = 8 * ff_hevc_diag_scan8x8_y[j] + ff_hevc_diag_scan8x8_x[j]; ++ ppc->ScalingList8x8[i][j] = sl->sl[1][i][pos]; ++ ppc->ScalingList16x16[i][j] = sl->sl[2][i][pos]; ++ ++ if (i < 2) ++ ppc->ScalingList32x32[i][j] = sl->sl[3][i][pos]; ++ } ++ } ++ ++ memcpy(ppc->ScalingListDCCoeff16x16, sl->sl_dc[0], sizeof(ppc->ScalingListDCCoeff16x16)); ++ memcpy(ppc->ScalingListDCCoeff32x32, sl->sl_dc[1], sizeof(ppc->ScalingListDCCoeff32x32)); ++} ++ ++static int nvdec_hevc_start_frame(AVCodecContext *avctx, ++ const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext *s = avctx->priv_data; ++ const HEVCPPS *pps = s->ps.pps; ++ const HEVCSPS *sps = s->ps.sps; ++ ++ NVDECContext *ctx = avctx->internal->hwaccel_priv_data; ++ CUVIDPICPARAMS *pp = &ctx->pic_params; ++ CUVIDHEVCPICPARAMS *ppc = &pp->CodecSpecific.hevc; ++ FrameDecodeData *fdd; ++ NVDECFrame *cf; ++ ++ int i, j, dpb_size, ret; ++ ++ ret = ff_nvdec_start_frame(avctx, s->ref->frame); ++ if (ret < 0) ++ return ret; ++ ++ fdd = (FrameDecodeData*)s->ref->frame->private_ref->data; ++ cf = (NVDECFrame*)fdd->hwaccel_priv; ++ ++ *pp = (CUVIDPICPARAMS) { ++ .PicWidthInMbs = sps->width / 16, ++ .FrameHeightInMbs = sps->height / 16, ++ .CurrPicIdx = cf->idx, ++ .ref_pic_flag = 1, ++ .intra_pic_flag = 0, ++ ++ .CodecSpecific.hevc = { ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .pcm_enabled_flag = sps->pcm_enabled_flag, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm_enabled_flag ? sps->pcm.log2_min_pcm_cb_size - 3 : 0, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm_enabled_flag ? sps->pcm.bit_depth - 1 : 0, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm_enabled_flag ? sps->pcm.bit_depth_chroma - 1 : 0, ++ .pcm_loop_filter_disabled_flag = sps->pcm.loop_filter_disable_flag, ++ .strong_intra_smoothing_enabled_flag = sps->sps_strong_intra_smoothing_enable_flag, ++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .amp_enabled_flag = sps->amp_enabled_flag, ++ .separate_colour_plane_flag = sps->separate_colour_plane_flag, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .long_term_ref_pics_present_flag = sps->long_term_ref_pics_present_flag, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ .sps_temporal_mvp_enabled_flag = sps->sps_temporal_mvp_enabled_flag, ++ .sample_adaptive_offset_enabled_flag = sps->sao_enabled, ++ .scaling_list_enable_flag = sps->scaling_list_enable_flag, ++ .IrapPicFlag = IS_IRAP(s), ++ .IdrPicFlag = IS_IDR(s), ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ ++ .dependent_slice_segments_enabled_flag = pps->dependent_slice_segments_enabled_flag, ++ .slice_segment_header_extension_present_flag = pps->slice_header_extension_present_flag, ++ .sign_data_hiding_enabled_flag = pps->sign_data_hiding_flag, ++ .cu_qp_delta_enabled_flag = pps->cu_qp_delta_enabled_flag, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .constrained_intra_pred_flag = pps->constrained_intra_pred_flag, ++ .weighted_pred_flag = pps->weighted_pred_flag, ++ .weighted_bipred_flag = pps->weighted_bipred_flag, ++ .transform_skip_enabled_flag = pps->transform_skip_enabled_flag, ++ .transquant_bypass_enabled_flag = pps->transquant_bypass_enable_flag, ++ .entropy_coding_sync_enabled_flag = pps->entropy_coding_sync_enabled_flag, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .loop_filter_across_tiles_enabled_flag = pps->loop_filter_across_tiles_enabled_flag, ++ .loop_filter_across_slices_enabled_flag = pps->seq_loop_filter_across_slices_enabled_flag, ++ .output_flag_present_flag = pps->output_flag_present_flag, ++ .num_ref_idx_l0_default_active_minus1 = pps->num_ref_idx_l0_default_active - 1, ++ .num_ref_idx_l1_default_active_minus1 = pps->num_ref_idx_l1_default_active - 1, ++ .lists_modification_present_flag = pps->lists_modification_present_flag, ++ .cabac_init_present_flag = pps->cabac_init_present_flag, ++ .pps_slice_chroma_qp_offsets_present_flag = pps->pic_slice_level_chroma_qp_offsets_present_flag, ++ .deblocking_filter_override_enabled_flag = pps->deblocking_filter_override_enabled_flag, ++ .pps_deblocking_filter_disabled_flag = pps->disable_dbf, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .tiles_enabled_flag = pps->tiles_enabled_flag, ++ .uniform_spacing_flag = pps->uniform_spacing_flag, ++ .num_tile_columns_minus1 = pps->num_tile_columns - 1, ++ .num_tile_rows_minus1 = pps->num_tile_rows - 1, ++ ++ .NumBitsForShortTermRPSInSlice = s->sh.short_term_rps ? s->sh.short_term_ref_pic_set_size : 0, ++ .NumDeltaPocsOfRefRpsIdx = s->sh.short_term_rps ? s->sh.short_term_rps->rps_idx_num_delta_pocs : 0, ++ .NumPocTotalCurr = s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + ++ s->rps[LT_CURR].nb_refs, ++ .NumPocStCurrBefore = s->rps[ST_CURR_BEF].nb_refs, ++ .NumPocStCurrAfter = s->rps[ST_CURR_AFT].nb_refs, ++ .NumPocLtCurr = s->rps[LT_CURR].nb_refs, ++ .CurrPicOrderCntVal = s->ref->poc, ++ }, ++ }; ++ ++ if (pps->num_tile_columns > FF_ARRAY_ELEMS(ppc->column_width_minus1) || ++ pps->num_tile_rows > FF_ARRAY_ELEMS(ppc->row_height_minus1)) { ++ av_log(avctx, AV_LOG_ERROR, "Too many tiles\n"); ++ return AVERROR(ENOSYS); ++ } ++ for (i = 0; i < pps->num_tile_columns; i++) ++ ppc->column_width_minus1[i] = pps->column_width[i] - 1; ++ for (i = 0; i < pps->num_tile_rows; i++) ++ ppc->row_height_minus1[i] = pps->row_height[i] - 1; ++ ++ if (s->rps[LT_CURR].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetLtCurr) || ++ s->rps[ST_CURR_BEF].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrBefore) || ++ s->rps[ST_CURR_AFT].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrAfter)) { ++ av_log(avctx, AV_LOG_ERROR, "Too many reference frames\n"); ++ return AVERROR(ENOSYS); ++ } ++ ++ dpb_size = 0; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ const HEVCFrame *ref = &s->DPB[i]; ++ if (!(ref->flags & (HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF))) ++ continue; ++ if (dpb_size >= FF_ARRAY_ELEMS(ppc->RefPicIdx)) { ++ av_log(avctx, AV_LOG_ERROR, "Too many reference frames\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ dpb_add(ppc, dpb_size++, ref); ++ ++ } ++ for (i = dpb_size; i < FF_ARRAY_ELEMS(ppc->RefPicIdx); i++) ++ ppc->RefPicIdx[i] = -1; ++ ++ for (i = 0; i < s->rps[ST_CURR_BEF].nb_refs; i++) { ++ for (j = 0; j < dpb_size; j++) { ++ if (ppc->PicOrderCntVal[j] == s->rps[ST_CURR_BEF].list[i]) { ++ ppc->RefPicSetStCurrBefore[i] = j; ++ break; ++ } ++ } ++ } ++ for (i = 0; i < s->rps[ST_CURR_AFT].nb_refs; i++) { ++ for (j = 0; j < dpb_size; j++) { ++ if (ppc->PicOrderCntVal[j] == s->rps[ST_CURR_AFT].list[i]) { ++ ppc->RefPicSetStCurrAfter[i] = j; ++ break; ++ } ++ } ++ } ++ for (i = 0; i < s->rps[LT_CURR].nb_refs; i++) { ++ for (j = 0; j < dpb_size; j++) { ++ if (ppc->PicOrderCntVal[j] == s->rps[LT_CURR].list[i]) { ++ ppc->RefPicSetLtCurr[i] = j; ++ break; ++ } ++ } ++ } ++ ++ fill_scaling_lists(ppc, s); ++ ++ return 0; ++} ++ ++static int nvdec_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, ++ uint32_t size) ++{ ++ NVDECContext *ctx = avctx->internal->hwaccel_priv_data; ++ void *tmp; ++ ++ tmp = av_fast_realloc(ctx->bitstream, &ctx->bitstream_allocated, ++ ctx->bitstream_len + size + 3); ++ if (!tmp) ++ return AVERROR(ENOMEM); ++ ctx->bitstream = tmp; ++ ++ tmp = av_fast_realloc(ctx->slice_offsets, &ctx->slice_offsets_allocated, ++ (ctx->nb_slices + 1) * sizeof(*ctx->slice_offsets)); ++ if (!tmp) ++ return AVERROR(ENOMEM); ++ ctx->slice_offsets = tmp; ++ ++ AV_WB24(ctx->bitstream + ctx->bitstream_len, 1); ++ memcpy(ctx->bitstream + ctx->bitstream_len + 3, buffer, size); ++ ctx->slice_offsets[ctx->nb_slices] = ctx->bitstream_len ; ++ ctx->bitstream_len += size + 3; ++ ctx->nb_slices++; ++ ++ return 0; ++} ++ ++static int nvdec_hevc_decode_init(AVCodecContext *avctx) ++{ ++ const HEVCContext *s = avctx->priv_data; ++ const HEVCSPS *sps = s->ps.sps; ++ return ff_nvdec_decode_init(avctx, sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + 1); ++} ++ ++AVHWAccel ff_hevc_nvdec_hwaccel = { ++ .name = "hevc_nvdec", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_CUDA, ++ .start_frame = nvdec_hevc_start_frame, ++ .end_frame = ff_nvdec_end_frame, ++ .decode_slice = nvdec_hevc_decode_slice, ++ .init = nvdec_hevc_decode_init, ++ .uninit = ff_nvdec_decode_uninit, ++ .priv_data_size = sizeof(NVDECContext), ++}; _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog