On 19/01/2020 13:11, Lynne wrote: > Jan 18, 2020, 20:23 by s...@jkqxz.net: > >> On 10/01/2020 21:05, Lynne wrote: >> >> The CUDA parts look like they could be split off into a separate commit? >> (It's already huge.) >> > > I don't really want broken commits or commits with dead code.
Shouldn't be - you've got nice #ifdef markers surrounding exactly the right pieces to splice out :) >>> @@ -3639,7 +3641,7 @@ avformat_deps="avcodec avutil" >>> avformat_suggest="libm network zlib" >>> avresample_deps="avutil" >>> avresample_suggest="libm" >>> -avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 >>> vaapi videotoolbox corefoundation corevideo coremedia bcrypt" >>> +avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 >>> vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt" >>> postproc_deps="avutil gpl" >>> postproc_suggest="libm" >>> swresample_deps="avutil" >>> @@ -6626,6 +6628,9 @@ enabled vdpau && >>> >>> enabled crystalhd && check_lib crystalhd "stdint.h >>> libcrystalhd/libcrystalhd_if.h" DtsCrystalHDVersion -lcrystalhd >>> >>> +enabled vulkan && >>> + require_pkg_config vulkan "vulkan >= 1.1.97" "vulkan/vulkan.h" >>> vkCreateInstance >>> >> >> Presumably you have some specific requirement in mind which wants this >> version - can you note it somewhere? (Either here or in the commit message.) >> > > The DRM export/import and I think the semaphore import API. > Its not a new version, its been out for a long time. About a year, which probably isn't long enough to assume it's everywhere. I guess it doesn't really matter, though, because it will only ever increase when something extra is added. >>> ... >>> +static int cuda_device_derive(AVHWDeviceContext *device_ctx, >>> + AVHWDeviceContext *src_ctx, >>> + int flags) { >>> + AVCUDADeviceContext *hwctx = device_ctx->hwctx; >>> + CudaFunctions *cu; >>> + const char *src_uuid = NULL; >>> + CUcontext dummy; >>> + int ret, i, device_count, dev_active = 0; >>> + unsigned int dev_flags = 0; >>> + >>> + const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC; >>> + >>> + switch (src_ctx->type) { >>> +#if CONFIG_VULKAN >>> + case AV_HWDEVICE_TYPE_VULKAN: { >>> + AVVulkanDeviceContext *vkctx = src_ctx->hwctx; >>> + src_uuid = vkctx->device_uuid; >>> + break; >>> + } >>> +#endif >>> + default: >>> + return AVERROR(ENOSYS); >>> + } >>> + >>> + if (!src_uuid) { >>> + av_log(device_ctx, AV_LOG_ERROR, >>> + "Failed to get UUID of source device.\n"); >>> + goto error; >>> + } >>> + >>> + if (cuda_device_init(device_ctx) < 0) >>> + goto error; >>> + >>> + cu = hwctx->internal->cuda_dl; >>> + >>> + ret = CHECK_CU(cu->cuInit(0)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + ret = CHECK_CU(cu->cuDeviceGetCount(&device_count)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + hwctx->internal->cuda_device = -1; >>> + for (i = 0; i < device_count; i++) { >>> + CUdevice dev; >>> + CUuuid uuid; >>> + >>> + ret = CHECK_CU(cu->cuDeviceGet(&dev, i)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) { >>> + hwctx->internal->cuda_device = dev; >>> + break; >>> + } >>> + } >>> + >>> + if (hwctx->internal->cuda_device == -1) { >>> + av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA >>> device.\n"); >>> >> >> This error is maybe more like "Can't find the matching CUDA device to the >> supplied Vulkan device". >> > > Let's keep it that way, its not meant to be vulkan specific, though its only > used by vulkan currently. Good point! Fair enough. >>> + goto error; >>> + } >>> + >>> + hwctx->internal->flags = flags; >>> + >>> + if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) { >>> + ret = >>> CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, >>> &dev_flags, &dev_active)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + if (dev_active && dev_flags != desired_flags) { >>> + av_log(device_ctx, AV_LOG_ERROR, "Primary context already >>> active with incompatible flags.\n"); >>> + goto error; >>> + } else if (dev_flags != desired_flags) { >>> + ret = >>> CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, >>> desired_flags)); >>> + if (ret < 0) >>> + goto error; >>> + } >>> + >>> + ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, >>> hwctx->internal->cuda_device)); >>> + if (ret < 0) >>> + goto error; >>> + } else { >>> + ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, >>> hwctx->internal->cuda_device)); >>> + if (ret < 0) >>> + goto error; >>> + >>> + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); >>> + } >>> + >>> + hwctx->internal->is_allocated = 1; >>> + >>> + // Setting stream to NULL will make functions automatically use the >>> default CUstream >>> + hwctx->stream = NULL; >>> + >>> + return 0; >>> + >>> +error: >>> + cuda_device_uninit(device_ctx); >>> + return AVERROR_UNKNOWN; >>> +} >>> + >>> const HWContextType ff_hwcontext_type_cuda = { >>> .type = AV_HWDEVICE_TYPE_CUDA, >>> .name = "CUDA", >>> @@ -397,6 +517,7 @@ const HWContextType ff_hwcontext_type_cuda = { >>> .frames_priv_size = sizeof(CUDAFramesContext), >>> >>> .device_create = cuda_device_create, >>> + .device_derive = cuda_device_derive, >>> .device_init = cuda_device_init, >>> .device_uninit = cuda_device_uninit, >>> .frames_get_constraints = cuda_frames_get_constraints, >>> ... >>> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c >>> new file mode 100644 >>> index 0000000000..d4eb8ffd35 >>> --- /dev/null >>> +++ b/libavutil/hwcontext_vulkan.c >>> @@ -0,0 +1,2804 @@ >>> ... >>> + >>> +static const struct { >>> + enum AVPixelFormat pixfmt; >>> + const VkFormat vkfmts[3]; >>> +} vk_pixfmt_map[] = { >>> + { AV_PIX_FMT_GRAY8, { VK_FORMAT_R8_UNORM } }, >>> + { AV_PIX_FMT_GRAY16, { VK_FORMAT_R16_UNORM } }, >>> + { AV_PIX_FMT_GRAYF32, { VK_FORMAT_R32_SFLOAT } }, >>> + >>> + { AV_PIX_FMT_NV12, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8G8_UNORM } }, >>> + { AV_PIX_FMT_P010, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } }, >>> >> >> Is P010 still safe when the low bits might have any value? >> > > Our padding is in the top bits. Vulkan defines it in the bottom bits, hence > we can't use it. ? Our padding is definitely in the bottom bits, since it's defined to be consistent with all the hardware using it. > I can't see why it wouldn't be safe. Every code that deals with user-supplied > frames must rely they're junk. Probably close enough for government work, but still slightly off. If the only operation you can do is load it into a float as UNORM then a top value with zeroes would be slightly off-white and the colourspace-conversion nazis will hunt you down. >>> + { AV_PIX_FMT_P016, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } }, >>> + >>> + { AV_PIX_FMT_YUV420P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, >>> VK_FORMAT_R8_UNORM } }, >>> + { AV_PIX_FMT_YUV422P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, >>> VK_FORMAT_R8_UNORM } }, >>> + { AV_PIX_FMT_YUV444P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, >>> VK_FORMAT_R8_UNORM } }, >>> + >>> + { AV_PIX_FMT_YUV420P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, >>> VK_FORMAT_R16_UNORM } }, >>> + { AV_PIX_FMT_YUV422P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, >>> VK_FORMAT_R16_UNORM } }, >>> + { AV_PIX_FMT_YUV444P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, >>> VK_FORMAT_R16_UNORM } }, >>> + >>> + { AV_PIX_FMT_ABGR, { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } }, >>> + { AV_PIX_FMT_BGRA, { VK_FORMAT_B8G8R8A8_UNORM } }, >>> + { AV_PIX_FMT_RGBA, { VK_FORMAT_R8G8B8A8_UNORM } }, >>> + { AV_PIX_FMT_RGB24, { VK_FORMAT_R8G8B8_UNORM } }, >>> + { AV_PIX_FMT_BGR24, { VK_FORMAT_B8G8R8_UNORM } }, >>> + { AV_PIX_FMT_RGB48, { VK_FORMAT_R16G16B16_UNORM } }, >>> + { AV_PIX_FMT_RGBA64, { VK_FORMAT_R16G16B16A16_UNORM } }, >>> + { AV_PIX_FMT_RGB565, { VK_FORMAT_R5G6B5_UNORM_PACK16 } }, >>> + { AV_PIX_FMT_BGR565, { VK_FORMAT_B5G6R5_UNORM_PACK16 } }, >>> + { AV_PIX_FMT_BGR0, { VK_FORMAT_B8G8R8A8_UNORM } }, >>> + { AV_PIX_FMT_0BGR, { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } }, >>> + { AV_PIX_FMT_RGB0, { VK_FORMAT_R8G8B8A8_UNORM } }, >>> + >>> + { AV_PIX_FMT_GBRPF32, { VK_FORMAT_R32_SFLOAT, VK_FORMAT_R32_SFLOAT, >>> VK_FORMAT_R32_SFLOAT } }, >>> +}; >>> + >>> ... >>> +static int vulkan_device_create(AVHWDeviceContext *ctx, const char *device, >>> + AVDictionary *opts, int flags) >>> +{ >>> + VulkanDeviceSelection dev_select = { 0 }; >>> + if (device && device[0]) { >>> + char *end = NULL; >>> + dev_select.index = strtol(device, &end, 10); >>> + if (end == device) { >>> + dev_select.index = 0; >>> + dev_select.name = device; >>> + } >>> + } >>> >> >> Is it worth making uuid=f00 work here as well? (From opts rather than >> device: "-init_hw_device vulkan:,uuid=f00".) >> > > Would like to, but I don't think its worth the extra dependency (libuuid, its > widespread on linux but I'd rather not NIH parsing). Ok. >>> + >>> + return vulkan_device_create_internal(ctx, &dev_select, opts, flags); >>> +} >>> ... >>> + >>> +static int vulkan_frames_init(AVHWFramesContext *hwfc) >>> +{ >>> + int err; >>> + AVVkFrame *f; >>> + AVVulkanFramesContext *hwctx = hwfc->hwctx; >>> + VulkanFramesPriv *fp = hwfc->internal->priv; >>> + AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx; >>> + VulkanDevicePriv *p = hwfc->device_ctx->internal->priv; >>> + >>> + if (hwfc->pool) >>> + return 0; >>> + >>> + /* Default pool flags */ >>> + hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ? >>> + VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL; >>> + >>> + hwctx->usage |= DEFAULT_USAGE_FLAGS; >>> >> >> Is it possible that this disallows some use-cases in a device where those >> default flags need not be not supported? For example, some sort of magic >> image-writer like a video decoder where the output images can only ever be >> used as a source by non-magic operations. Baking that into the API (as in >> the comment on usage in the header) seems bad if so. >> > > You need to support those flags to do anything useful with the images. > This restricts read-only images since those don't support the STORAGE flag. > Such images are the planar images, which DO support the STORAGE flag but only > for their subimages (e.g. individual planes). This is in spec, regardless > what Intel says and disagrees with (they advise to alias memory instead). We > don't use them anyway, and if a user wanted to repackage received frames as > planar (or unpackage them) they still can. I feel like devices with weird memory are still a problem here, but reading the code again I can see that the user-provided pool actually trumps the concern anyway - if anything funny is going on then the user can set it up however they want. >>> ... >>> + >>> +static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, >>> AVVkFrame **frame, >>> + AVDRMFrameDescriptor *desc) >>> +{ >>> + int err = 0; >>> + VkResult ret; >>> + AVVkFrame *f; >>> + AVHWDeviceContext *ctx = hwfc->device_ctx; >>> + AVVulkanDeviceContext *hwctx = ctx->hwctx; >>> + VulkanDevicePriv *p = ctx->internal->priv; >>> + const int planes = av_pix_fmt_count_planes(hwfc->sw_format); >>> + const AVPixFmtDescriptor *fmt_desc = >>> av_pix_fmt_desc_get(hwfc->sw_format); >>> + const int has_modifiers = p->extensions & EXT_DRM_MODIFIER_FLAGS; >>> + VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS]; >>> + VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS]; >>> + VkExternalMemoryHandleTypeFlagBits htype = >>> VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; >>> + >>> + VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdPropertiesKHR); >>> + >>> + for (int i = 0; i < desc->nb_layers; i++) { >>> + if (desc->layers[i].nb_planes > 1) { >>> + av_log(ctx, AV_LOG_ERROR, "Cannot import DMABUFS with more >>> than 1 " >>> + "plane per layer!\n"); >>> + return AVERROR(EINVAL); >>> + } >>> + >>> + if (drm_to_vulkan_fmt(desc->layers[i].format) == >>> VK_FORMAT_UNDEFINED) { >>> + av_log(ctx, AV_LOG_ERROR, "Unsupported DMABUF layer >>> format!\n"); >>> >> >> Maybe say what the unsupported format is for someone reporting the message, >> since this is probably relatively easy to hit (e.g. YUYV). >> > > Sure, I'll just use the handy DRM API/define to translate DRM FOURCC uint32_t > to a string. > Oh wait, I can't because there is no such API. I've had to manually go > through every single define in the past, relying on blind luck and > speculation to match those up. DRM devs want people to suffer, as if the > big-endian confusion-inducing FOURCC isn't evidence enough for it. > Not willing to NIH something to pull the chars out of it yet. Next time I > have to look up a drm format id maybe. Print it with %#08x. I just think it should appear in the log so that it's possible for someone reading to log to see the format which failed; if they have to decode it a little bit themselves that isn't a disaster. >>> + return AVERROR(EINVAL); >>> + } >>> + } >>> + >>> + if (!(f = av_mallocz(sizeof(*f)))) { >>> + av_log(ctx, AV_LOG_ERROR, "Unable to allocate memory for >>> AVVkFrame!\n"); >>> + err = AVERROR(ENOMEM); >>> + goto fail; >>> + } >>> + >>> ... >>> + >>> +static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst, >>> + const AVFrame *src, int flags) >>> +{ >>> + int err = 0; >>> + VkResult ret; >>> + AVVkFrame *f = (AVVkFrame *)src->data[0]; >>> + VulkanDevicePriv *p = hwfc->device_ctx->internal->priv; >>> + AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx; >>> + const int planes = av_pix_fmt_count_planes(hwfc->sw_format); >>> + VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR); >>> + VkImageDrmFormatModifierPropertiesEXT drm_mod = { >>> + .sType = >>> VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT, >>> + }; >>> >> >> Do you need a sync here for any writing being finished, or is it implicit >> somehow below? >> > > There isn't. This would be where we export a semaphore to VAAPI. > We could make a CPU sync by converting the image to read optimal and waiting > on the command queue's semaphore, but that would need another exec context. > Should we? > I don't have a way to test this ATM, my test program is gone, so if I do > touch it I'll need to write a new one, which is a lot of work. With ffmpeg I > can only test raw exporting without it actually being used by anything, since > everything complains about some missing contexts IIRC. You often get away with this because some part of the next operation ends up queued on the same hardware submission ring. I guess leave it unless you want to add the extra sync, but add a comment for the next person reading it. >>> + >>> + AVDRMFrameDescriptor *drm_desc = av_mallocz(sizeof(*drm_desc)); >>> + if (!drm_desc) >>> + return AVERROR(ENOMEM); >>> + >>> + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, >>> &vulkan_unmap_to_drm, drm_desc); >>> + if (err < 0) >>> + goto end; >>> + >>> + if (p->extensions & EXT_DRM_MODIFIER_FLAGS) { >>> + VK_LOAD_PFN(hwctx->inst, vkGetImageDrmFormatModifierPropertiesEXT); >>> + ret = pfn_vkGetImageDrmFormatModifierPropertiesEXT(hwctx->act_dev, >>> f->img[0], >>> + &drm_mod); >>> + if (ret != VK_SUCCESS) { >>> + av_log(hwfc, AV_LOG_ERROR, "Failed to retrieve DRM format >>> modifier!\n"); >>> + err = AVERROR_EXTERNAL; >>> + goto end; >>> + } >>> + } >>> + >>> ... >>> + >>> +static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame >>> *dst, >>> + const AVFrame *src) >>> +{ >>> + int err = 0; >>> + AVFrame tmp; >>> + AVVkFrame *f = (AVVkFrame *)src->data[0]; >>> + AVHWDeviceContext *dev_ctx = hwfc->device_ctx; >>> + ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } }; >>> + const int planes = av_pix_fmt_count_planes(dst->format); >>> + int log2_chroma = av_pix_fmt_desc_get(dst->format)->log2_chroma_h; >>> + >>> + if (dst->width > hwfc->width || dst->height > hwfc->height) >>> + return AVERROR(EINVAL); >>> + >>> + /* For linear, host visiable images */ >>> + if (f->tiling == VK_IMAGE_TILING_LINEAR && >>> + f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { >>> >> >> Is it generally expected that this is actually faster than the next option? >> (Because evil uncached memory is a thing.) >> > > Could be. On Intel it was faster, but now its as fast as tiled images. On > NVIDIA its slower. Its still faster on Intel for simple short filter chains. > Keeping it for cheap devices. Yep, fair enough. >>> + AVFrame *map = av_frame_alloc(); >>> + if (!map) >>> + return AVERROR(ENOMEM); >>> + map->format = dst->format; >>> + >>> + err = vulkan_map_frame_to_mem(hwfc, map, src, AV_HWFRAME_MAP_READ); >>> + if (err) >>> + return err; >>> + >>> + err = av_frame_copy(dst, map); >>> + av_frame_free(&map); >>> + return err; >>> + } >>> ... Thanks, - Mark _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".