2 * This file is part of FFmpeg.
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "hwcontext.h"
22 #include "hwcontext_internal.h"
23 #include "hwcontext_cuda_internal.h"
25 #include "hwcontext_vulkan.h"
27 #include "cuda_check.h"
/*
 * Per-frames-context private state, stored in AVHWFramesContext.hwctx.
 * NOTE(review): view is truncated — the closing "} CUDAFramesContext;" and
 * further members (a tex_alignment field is referenced by cuda_frames_init()
 * below) are not visible here.
 */
33 typedef struct CUDAFramesContext
{
/* Chroma subsampling shifts for the frames context's sw_format; filled by
 * av_pix_fmt_get_chroma_sub_sample() in cuda_frames_init() and used to
 * derive per-plane heights during transfers. */
34 int shift_width
, shift_height
;
/*
 * Per-device private state, stored in AVHWDeviceContext.hwctx.
 * The public AVCUDADeviceContext and its internal companion are embedded
 * together; cuda_device_init() points p.internal at the embedded internal
 * struct. NOTE(review): closing "} CUDADeviceContext;" is not visible here.
 */
38 typedef struct CUDADeviceContext
{
/* Public part, exposed to API users via AVHWDeviceContext.hwctx. */
39 AVCUDADeviceContext p
;
/* Private part: loaded CUDA function table, device handle, flags. */
40 AVCUDADeviceContextInternal internal
;
/*
 * Software pixel formats usable as sw_format of a CUDA frames context.
 * NOTE(review): view is truncated — only two of the entries and neither the
 * start nor the end of the initializer list are visible here.
 */
43 static const enum AVPixelFormat supported_formats
[] = {
57 AV_PIX_FMT_YUV444P10MSB
,
58 AV_PIX_FMT_YUV444P12MSB
,
/* Wrap a CUDA driver-API call: check its result and log failures against
 * 'device_ctx'. Requires local variables 'device_ctx' and 'cu' (the loaded
 * CudaFunctions table) to be in scope at the call site. */
69 #define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
/*
 * HWContextType.frames_get_constraints callback: report the pixel formats
 * a CUDA frames context can use. Fills constraints->valid_sw_formats with
 * the supported_formats list and constraints->valid_hw_formats with
 * { AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }.
 * Returns 0 on success, AVERROR(ENOMEM) on allocation failure.
 * NOTE(review): view is truncated — the opening brace, the declaration of
 * loop index 'i', the trailing "return 0;" and the closing brace are not
 * visible here.
 */
71 static int cuda_frames_get_constraints(AVHWDeviceContext
*ctx
,
73 AVHWFramesConstraints
*constraints
)
/* One slot per supported format plus an AV_PIX_FMT_NONE terminator. */
77 constraints
->valid_sw_formats
= av_malloc_array(FF_ARRAY_ELEMS(supported_formats
) + 1,
78 sizeof(*constraints
->valid_sw_formats
));
79 if (!constraints
->valid_sw_formats
)
80 return AVERROR(ENOMEM
);
82 for (i
= 0; i
< FF_ARRAY_ELEMS(supported_formats
); i
++)
83 constraints
->valid_sw_formats
[i
] = supported_formats
[i
];
/* Terminate the list so callers can iterate without knowing its length. */
84 constraints
->valid_sw_formats
[FF_ARRAY_ELEMS(supported_formats
)] = AV_PIX_FMT_NONE
;
/* The only hw format is AV_PIX_FMT_CUDA, again NONE-terminated. */
86 constraints
->valid_hw_formats
= av_malloc_array(2, sizeof(*constraints
->valid_hw_formats
));
87 if (!constraints
->valid_hw_formats
)
88 return AVERROR(ENOMEM
);
90 constraints
->valid_hw_formats
[0] = AV_PIX_FMT_CUDA
;
91 constraints
->valid_hw_formats
[1] = AV_PIX_FMT_NONE
;
/*
 * AVBuffer free callback for pool buffers allocated by cuda_pool_alloc():
 * makes the device's CUDA context current, frees the device allocation,
 * then restores the previous context.
 * 'opaque' is the owning AVHWFramesContext; 'data' is the CUdeviceptr cast
 * to uint8_t* at creation time.
 * NOTE(review): view is truncated — the opening brace, the 'CUcontext dummy'
 * declaration and the closing brace are not visible here.
 */
96 static void cuda_buffer_free(void *opaque
, uint8_t *data
)
98 AVHWFramesContext
*ctx
= opaque
;
99 AVHWDeviceContext
*device_ctx
= ctx
->device_ctx
;
100 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
101 CudaFunctions
*cu
= hwctx
->internal
->cuda_dl
;
/* Errors are only logged via CHECK_CU — a free callback cannot fail. */
105 CHECK_CU(cu
->cuCtxPushCurrent(hwctx
->cuda_ctx
));
107 CHECK_CU(cu
->cuMemFree((CUdeviceptr
)data
));
109 CHECK_CU(cu
->cuCtxPopCurrent(&dummy
));
/*
 * AVBufferPool allocator callback: allocates 'size' bytes of CUDA device
 * memory under the device's context and wraps the pointer in an AVBufferRef
 * whose free callback is cuda_buffer_free(). Returns NULL on failure.
 * NOTE(review): view is truncated — the declarations of 'err' and 'data',
 * the error checks/gotos after each CHECK_CU, the "if (!ret)" guard before
 * the compensating cuMemFree, the fail label and the final "return ret;"
 * are not visible here.
 */
112 static AVBufferRef
*cuda_pool_alloc(void *opaque
, size_t size
)
114 AVHWFramesContext
*ctx
= opaque
;
115 AVHWDeviceContext
*device_ctx
= ctx
->device_ctx
;
116 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
117 CudaFunctions
*cu
= hwctx
->internal
->cuda_dl
;
119 AVBufferRef
*ret
= NULL
;
120 CUcontext dummy
= NULL
;
/* All CUDA calls below require the device's context to be current. */
124 err
= CHECK_CU(cu
->cuCtxPushCurrent(hwctx
->cuda_ctx
));
128 err
= CHECK_CU(cu
->cuMemAlloc(&data
, size
));
/* Wrap the raw device pointer; the frames context is the free-callback
 * opaque so cuda_buffer_free() can find the CUDA context again. */
132 ret
= av_buffer_create((uint8_t*)data
, size
, cuda_buffer_free
, ctx
, 0);
/* Presumably reached only when av_buffer_create() failed: release the
 * device memory ourselves since no buffer owns it — TODO confirm the
 * elided guard above. */
134 CHECK_CU(cu
->cuMemFree(data
));
139 CHECK_CU(cu
->cuCtxPopCurrent(&dummy
));
/*
 * HWContextType.frames_init callback: validates ctx->sw_format against
 * supported_formats, queries the device's texture alignment to use as the
 * plane linesize alignment, records chroma subsampling shifts, and (when no
 * user pool was supplied) sets up an internal buffer pool backed by
 * cuda_pool_alloc().
 * Returns 0 on success, AVERROR(ENOSYS) for unsupported formats,
 * AVERROR(ENOMEM) on pool allocation failure.
 * NOTE(review): view is truncated — the opening brace, declarations of
 * 'i'/'err', the loop's "break;", the error check after cuDeviceGetAttribute,
 * the "if (!ctx->pool)" guard presumably wrapping the pool setup, the final
 * "return 0;" and the closing brace are not visible here.
 */
143 static int cuda_frames_init(AVHWFramesContext
*ctx
)
145 AVHWDeviceContext
*device_ctx
= ctx
->device_ctx
;
146 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
147 CUDAFramesContext
*priv
= ctx
->hwctx
;
148 CudaFunctions
*cu
= hwctx
->internal
->cuda_dl
;
/* Linear search: sw_format must appear in supported_formats. */
151 for (i
= 0; i
< FF_ARRAY_ELEMS(supported_formats
); i
++)
 {
152 if (ctx
->sw_format
== supported_formats
[i
])
155 if (i
== FF_ARRAY_ELEMS(supported_formats
)) {
156 av_log(ctx
, AV_LOG_ERROR
, "Pixel format '%s' is not supported\n",
157 av_get_pix_fmt_name(ctx
->sw_format
));
158 return AVERROR(ENOSYS
);
/* The attribute enum value is hard-coded so the dynamic loader does not
 * need the CUDA headers' enum definition. */
161 err
= CHECK_CU(cu
->cuDeviceGetAttribute(&priv
->tex_alignment
,
162 14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */,
163 hwctx
->internal
->cuda_device
));
167 av_log(ctx
, AV_LOG_DEBUG
, "CUDA texture alignment: %d\n", priv
->tex_alignment
);
169 // YUV420P is a special case.
170 // Since nvenc expects the U/V planes to have half the linesize of the Y plane
171 // alignment has to be doubled to ensure the U/V planes still end up aligned.
172 if (ctx
->sw_format
== AV_PIX_FMT_YUV420P
)
173 priv
->tex_alignment
*= 2;
/* Cache chroma shifts for per-plane height math in cuda_transfer_data(). */
175 av_pix_fmt_get_chroma_sub_sample(ctx
->sw_format
, &priv
->shift_width
, &priv
->shift_height
);
/* Total image size with each plane's linesize padded to tex_alignment. */
178 int size
= av_image_get_buffer_size(ctx
->sw_format
, ctx
->width
, ctx
->height
, priv
->tex_alignment
);
/* Internal pool: every buffer is one whole image allocation. */
182 ffhwframesctx(ctx
)->pool_internal
=
183 av_buffer_pool_init2(size
, ctx
, cuda_pool_alloc
, NULL
);
184 if (!ffhwframesctx(ctx
)->pool_internal
)
185 return AVERROR(ENOMEM
);
/*
 * HWContextType.frames_get_buffer callback: takes a buffer from the pool
 * and maps the plane pointers/linesizes into 'frame', using the alignment
 * computed in cuda_frames_init(). Returns 0 on success (per convention;
 * the elided tail is not visible), AVERROR(ENOMEM) when the pool is empty.
 * NOTE(review): view is truncated — the opening brace, the 'res'
 * declaration, the "if (!frame->buf[0])" guard before returning ENOMEM,
 * the "if (res < 0)" check, the closing brace of the YUV420P block and the
 * final "return 0;" are not visible here.
 */
191 static int cuda_get_buffer(AVHWFramesContext
*ctx
, AVFrame
*frame
)
193 CUDAFramesContext
*priv
= ctx
->hwctx
;
196 frame
->buf
[0] = av_buffer_pool_get(ctx
->pool
);
198 return AVERROR(ENOMEM
);
/* Lay out plane pointers inside the single pooled allocation. */
200 res
= av_image_fill_arrays(frame
->data
, frame
->linesize
, frame
->buf
[0]->data
,
201 ctx
->sw_format
, ctx
->width
, ctx
->height
, priv
->tex_alignment
);
205 // YUV420P is a special case.
206 // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned
207 if (ctx
->sw_format
== AV_PIX_FMT_YUV420P
) {
208 frame
->linesize
[1] = frame
->linesize
[2] = frame
->linesize
[0] / 2;
/* Swap U and V: V takes what fill_arrays placed as plane 1, and U is
 * re-derived immediately after V's new start. */
209 frame
->data
[2] = frame
->data
[1];
210 frame
->data
[1] = frame
->data
[2] + frame
->linesize
[2] * (ctx
->height
/ 2);
/* The returned frame is a hw frame regardless of sw_format. */
213 frame
->format
= AV_PIX_FMT_CUDA
;
214 frame
->width
= ctx
->width
;
215 frame
->height
= ctx
->height
;
/*
 * HWContextType.transfer_get_formats callback: for either transfer
 * direction ('dir' is unused in the visible code), the only supported
 * software format is the frames context's own sw_format, returned as a
 * NONE-terminated 2-entry list the caller must free.
 * NOTE(review): view is truncated — the opening brace, the "if (!fmts)"
 * guard before the ENOMEM return, the "*formats = fmts;" store, the final
 * "return 0;" and the closing brace are not visible here.
 */
220 static int cuda_transfer_get_formats(AVHWFramesContext
*ctx
,
221 enum AVHWFrameTransferDirection dir
,
222 enum AVPixelFormat
**formats
)
224 enum AVPixelFormat
*fmts
;
226 fmts
= av_malloc_array(2, sizeof(*fmts
));
228 return AVERROR(ENOMEM
);
230 fmts
[0] = ctx
->sw_format
;
231 fmts
[1] = AV_PIX_FMT_NONE
;
/*
 * Shared implementation for transfer_data_to and transfer_data_from:
 * copies every populated plane between 'src' and 'dst' with
 * cuMemcpy2DAsync on the device's stream, choosing HOST or DEVICE memory
 * type per side from the presence of hw_frames_ctx. Downloads (host dst)
 * are made synchronous by a trailing cuStreamSynchronize so the caller can
 * read the data immediately; uploads stay asynchronous on hwctx->stream.
 * Returns AVERROR(ENOSYS) when either hw side is not a CUDA frames context.
 * NOTE(review): view is truncated — the "AVFrame *src" parameter line, the
 * declarations of 'ret'/'i'/'dummy', the error gotos after each CHECK_CU,
 * the else keywords between the HOST/DEVICE branches, the exit label with
 * context pop, and the final return are not visible here.
 */
238 static int cuda_transfer_data(AVHWFramesContext
*ctx
, AVFrame
*dst
,
241 CUDAFramesContext
*priv
= ctx
->hwctx
;
242 AVHWDeviceContext
*device_ctx
= ctx
->device_ctx
;
243 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
244 CudaFunctions
*cu
= hwctx
->internal
->cuda_dl
;
/* Only CUDA<->host or CUDA<->CUDA transfers are supported. */
249 if ((src
->hw_frames_ctx
&& ((AVHWFramesContext
*)src
->hw_frames_ctx
->data
)->format
!= AV_PIX_FMT_CUDA
) ||
250 (dst
->hw_frames_ctx
&& ((AVHWFramesContext
*)dst
->hw_frames_ctx
->data
)->format
!= AV_PIX_FMT_CUDA
))
251 return AVERROR(ENOSYS
);
253 ret
= CHECK_CU(cu
->cuCtxPushCurrent(hwctx
->cuda_ctx
));
/* One 2D copy per populated plane. */
257 for (i
= 0; i
< FF_ARRAY_ELEMS(src
->data
) && src
->data
[i
]; i
++) {
258 CUDA_MEMCPY2D cpy
= {
259 .srcPitch
= src
->linesize
[i
],
260 .dstPitch
= dst
->linesize
[i
],
/* Copy only as many bytes per row as both sides can hold. */
261 .WidthInBytes
= FFMIN(src
->linesize
[i
], dst
->linesize
[i
]),
/* Planes 0 (luma) and 3 (alpha) are full height; chroma planes are
 * scaled by the shift recorded in cuda_frames_init(). */
262 .Height
= src
->height
>> ((i
== 0 || i
== 3) ? 0 : priv
->shift_height
),
265 if (src
->hw_frames_ctx
) {
266 cpy
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
267 cpy
.srcDevice
= (CUdeviceptr
)src
->data
[i
];
269 cpy
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
270 cpy
.srcHost
= src
->data
[i
];
273 if (dst
->hw_frames_ctx
) {
274 cpy
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
275 cpy
.dstDevice
= (CUdeviceptr
)dst
->data
[i
];
277 cpy
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
278 cpy
.dstHost
= dst
->data
[i
];
281 ret
= CHECK_CU(cu
->cuMemcpy2DAsync(&cpy
, hwctx
->stream
));
/* Downloads must complete before returning control to the caller. */
286 if (!dst
->hw_frames_ctx
) {
287 ret
= CHECK_CU(cu
->cuStreamSynchronize(hwctx
->stream
));
293 CHECK_CU(cu
->cuCtxPopCurrent(&dummy
));
/*
 * HWContextType.device_uninit callback (also used on init/create failure
 * paths): tears down the CUDA context if this code allocated it, unloads
 * the dynamically loaded function table and clears the internal state.
 * Safe to call when initialization never completed — everything is guarded
 * on p.internal being set.
 * NOTE(review): view is truncated — closing braces of the inner blocks and
 * of the function are not visible here.
 */
298 static void cuda_device_uninit(AVHWDeviceContext
*device_ctx
)
300 CUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
/* p.internal is only set once cuda_device_init() ran; skip otherwise. */
302 if (hwctx
->p
.internal
) {
303 CudaFunctions
*cu
= hwctx
->internal
.cuda_dl
;
/* Only release contexts we created; user-supplied or "current" contexts
 * are left alone. */
305 if (hwctx
->internal
.is_allocated
&& hwctx
->p
.cuda_ctx
) {
306 if (hwctx
->internal
.flags
& AV_CUDA_USE_PRIMARY_CONTEXT
)
307 CHECK_CU(cu
->cuDevicePrimaryCtxRelease(hwctx
->internal
.cuda_device
));
308 else if (!(hwctx
->internal
.flags
& AV_CUDA_USE_CURRENT_CONTEXT
))
309 CHECK_CU(cu
->cuCtxDestroy(hwctx
->p
.cuda_ctx
));
311 hwctx
->p
.cuda_ctx
= NULL
;
314 cuda_free_functions(&hwctx
->internal
.cuda_dl
);
315 memset(&hwctx
->internal
, 0, sizeof(hwctx
->internal
));
316 hwctx
->p
.internal
= NULL
;
/*
 * HWContextType.device_init callback: wires the public p.internal pointer
 * to the embedded internal struct and loads the CUDA driver entry points
 * via cuda_load_functions() if not already loaded. On load failure it logs
 * and cleans up via cuda_device_uninit().
 * NOTE(review): view is truncated — the opening brace, the 'ret'
 * declaration, the error-check/goto structure around the load, the success
 * "return 0;", the error label and the final return are not visible here.
 */
320 static int cuda_device_init(AVHWDeviceContext
*ctx
)
322 CUDADeviceContext
*hwctx
= ctx
->hwctx
;
/* Expose the embedded internal struct through the public context. */
325 hwctx
->p
.internal
= &hwctx
->internal
;
/* Idempotent: skip loading if a previous init already populated cuda_dl. */
327 if (!hwctx
->internal
.cuda_dl
) {
328 ret
= cuda_load_functions(&hwctx
->internal
.cuda_dl
, ctx
);
330 av_log(ctx
, AV_LOG_ERROR
, "Could not dynamically load CUDA\n");
/* Error path: undo the partial initialization. */
338 cuda_device_uninit(ctx
);
/*
 * Acquire a CUDA context according to 'flags':
 *  - AV_CUDA_USE_PRIMARY_CONTEXT: retain the device's primary context,
 *    first verifying/setting its scheduling flags (fails with ENOTSUP if it
 *    is already active with flags other than CU_CTX_SCHED_BLOCKING_SYNC);
 *  - AV_CUDA_USE_CURRENT_CONTEXT: adopt whatever context is current;
 *  - otherwise: create a fresh blocking-sync context and pop it so the
 *    calling thread's context stack is left unchanged.
 * Marks the context as owned (is_allocated) and clears hwctx->stream so
 * CUDA calls use the default stream.
 * NOTE(review): view is truncated — declarations of 'cu'/'dummy', error
 * checks after the CHECK_CU calls, several closing braces/else keywords and
 * the final "return 0;" are not visible here.
 */
342 static int cuda_context_init(AVHWDeviceContext
*device_ctx
, int flags
) {
343 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
346 int ret
, dev_active
= 0;
347 unsigned int dev_flags
= 0;
349 const unsigned int desired_flags
= CU_CTX_SCHED_BLOCKING_SYNC
;
351 cu
= hwctx
->internal
->cuda_dl
;
/* Remember how the context was obtained for cuda_device_uninit(). */
353 hwctx
->internal
->flags
= flags
;
355 if (flags
& AV_CUDA_USE_PRIMARY_CONTEXT
) {
356 ret
= CHECK_CU(cu
->cuDevicePrimaryCtxGetState(hwctx
->internal
->cuda_device
,
357 &dev_flags
, &dev_active
));
/* An already-active primary context's flags cannot be changed. */
361 if (dev_active
&& dev_flags
!= desired_flags
) {
362 av_log(device_ctx
, AV_LOG_ERROR
, "Primary context already active with incompatible flags.\n");
363 return AVERROR(ENOTSUP
);
364 } else if (dev_flags
!= desired_flags
) {
365 ret
= CHECK_CU(cu
->cuDevicePrimaryCtxSetFlags(hwctx
->internal
->cuda_device
,
371 ret
= CHECK_CU(cu
->cuDevicePrimaryCtxRetain(&hwctx
->cuda_ctx
,
372 hwctx
->internal
->cuda_device
));
375 } else if (flags
& AV_CUDA_USE_CURRENT_CONTEXT
) {
376 ret
= CHECK_CU(cu
->cuCtxGetCurrent(&hwctx
->cuda_ctx
));
379 av_log(device_ctx
, AV_LOG_INFO
, "Using current CUDA context.\n");
381 ret
= CHECK_CU(cu
->cuCtxCreate(&hwctx
->cuda_ctx
, desired_flags
,
382 hwctx
->internal
->cuda_device
));
/* cuCtxCreate leaves the new context current; pop to restore the
 * caller's context stack. */
386 CHECK_CU(cu
->cuCtxPopCurrent(&dummy
));
389 hwctx
->internal
->is_allocated
= 1;
391 // Setting stream to NULL will make functions automatically use the default CUstream
392 hwctx
->stream
= NULL
;
/*
 * Translate the "primary_ctx" and "current_ctx" dictionary options into
 * AV_CUDA_USE_PRIMARY_CONTEXT / AV_CUDA_USE_CURRENT_CONTEXT bits in *flags.
 * An option explicitly set to a falsy value clears the corresponding bit;
 * requesting both at once is rejected with AVERROR(EINVAL).
 * NOTE(review): view is truncated — the opening brace, the NULL guards
 * presumably wrapping the two strtol() calls (the option entries may be
 * absent), several closing braces and the final "return 0;" are not
 * visible here.
 */
397 static int cuda_flags_from_opts(AVHWDeviceContext
*device_ctx
,
398 AVDictionary
*opts
, int *flags
)
400 AVDictionaryEntry
*primary_ctx_opt
= av_dict_get(opts
, "primary_ctx", NULL
, 0);
401 AVDictionaryEntry
*current_ctx_opt
= av_dict_get(opts
, "current_ctx", NULL
, 0);
403 int use_primary_ctx
= 0, use_current_ctx
= 0;
405 use_primary_ctx
= strtol(primary_ctx_opt
->value
, NULL
, 10);
408 use_current_ctx
= strtol(current_ctx_opt
->value
, NULL
, 10);
/* The two modes are mutually exclusive. */
410 if (use_primary_ctx
&& use_current_ctx
) {
411 av_log(device_ctx
, AV_LOG_ERROR
, "Requested both primary and current CUDA context simultaneously.\n");
412 return AVERROR(EINVAL
);
/* Option present and truthy sets the bit; present and falsy clears it;
 * absent leaves *flags untouched. */
415 if (primary_ctx_opt
&& use_primary_ctx
) {
416 av_log(device_ctx
, AV_LOG_VERBOSE
, "Using CUDA primary device context\n");
417 *flags
|= AV_CUDA_USE_PRIMARY_CONTEXT
;
418 } else if (primary_ctx_opt
) {
419 av_log(device_ctx
, AV_LOG_VERBOSE
, "Disabling use of CUDA primary device context\n");
420 *flags
&= ~AV_CUDA_USE_PRIMARY_CONTEXT
;
423 if (current_ctx_opt
&& use_current_ctx
) {
424 av_log(device_ctx
, AV_LOG_VERBOSE
, "Using CUDA current device context\n");
425 *flags
|= AV_CUDA_USE_CURRENT_CONTEXT
;
426 } else if (current_ctx_opt
) {
427 av_log(device_ctx
, AV_LOG_VERBOSE
, "Disabling use of CUDA current device context\n");
428 *flags
&= ~AV_CUDA_USE_CURRENT_CONTEXT
;
/*
 * HWContextType.device_create callback: parses options into flags, parses
 * the device string as an index (strtol base 0, so hex/octal accepted),
 * loads the driver, calls cuInit, resolves the device handle and finally
 * acquires a context via cuda_context_init(). Failures fall through to
 * cuda_device_uninit() for cleanup.
 * NOTE(review): view is truncated — the "const char *device" parameter
 * line, declarations of 'cu', the guard presumably checking 'device' for
 * NULL before strtol, the error checks/gotos after each step, the success
 * return and the error label/return are not visible here.
 */
434 static int cuda_device_create(AVHWDeviceContext
*device_ctx
,
436 AVDictionary
*opts
, int flags
)
438 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
440 int ret
, device_idx
= 0;
442 ret
= cuda_flags_from_opts(device_ctx
, opts
, &flags
);
447 device_idx
= strtol(device
, NULL
, 0);
449 ret
= cuda_device_init(device_ctx
);
453 cu
= hwctx
->internal
->cuda_dl
;
455 ret
= CHECK_CU(cu
->cuInit(0));
459 ret
= CHECK_CU(cu
->cuDeviceGet(&hwctx
->internal
->cuda_device
, device_idx
));
463 ret
= cuda_context_init(device_ctx
, flags
);
/* Error path: releases everything acquired so far. */
470 cuda_device_uninit(device_ctx
);
/*
 * HWContextType.device_derive callback: derives a CUDA device from another
 * hw device (Vulkan in the visible code) by matching device UUIDs. The
 * source UUID is read from VkPhysicalDeviceIDProperties, then every CUDA
 * device is enumerated and cuDeviceGetUuid() compared against it; the first
 * match becomes the CUDA device, otherwise derivation fails.
 * NOTE(review): view is truncated — the trailing 'flags' parameter line,
 * declarations of 'cu'/'dev'/'uuid', the "#if CONFIG_VULKAN" conditionals,
 * the ".pNext = &vk_idp" chaining presumably inside vk_dev_props (required
 * for vk_idp to be filled — confirm against upstream), the default switch
 * case, error gotos/labels and returns are not visible here.
 */
474 static int cuda_device_derive(AVHWDeviceContext
*device_ctx
,
475 AVHWDeviceContext
*src_ctx
, AVDictionary
*opts
,
477 AVCUDADeviceContext
*hwctx
= device_ctx
->hwctx
;
479 const char *src_uuid
= NULL
;
481 VkPhysicalDeviceIDProperties vk_idp
;
483 int ret
, i
, device_count
;
485 ret
= cuda_flags_from_opts(device_ctx
, opts
, &flags
);
490 vk_idp
= (VkPhysicalDeviceIDProperties
) {
491 .sType
= VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES
,
495 switch (src_ctx
->type
) {
497 #define TYPE PFN_vkGetPhysicalDeviceProperties2
498 case AV_HWDEVICE_TYPE_VULKAN
: {
499 AVVulkanDeviceContext
*vkctx
= src_ctx
->hwctx
;
/* Resolve vkGetPhysicalDeviceProperties2 through the source context's
 * own instance loader rather than linking against Vulkan directly. */
500 TYPE prop_fn
= (TYPE
)vkctx
->get_proc_addr(vkctx
->inst
, "vkGetPhysicalDeviceProperties2");
501 VkPhysicalDeviceProperties2 vk_dev_props
= {
502 .sType
= VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2
,
505 prop_fn(vkctx
->phys_dev
, &vk_dev_props
);
506 src_uuid
= vk_idp
.deviceUUID
;
/* Any other source device type cannot be derived from. */
512 ret
= AVERROR(ENOSYS
);
517 av_log(device_ctx
, AV_LOG_ERROR
,
518 "Failed to get UUID of source device.\n");
519 ret
= AVERROR(EINVAL
);
523 ret
= cuda_device_init(device_ctx
);
527 cu
= hwctx
->internal
->cuda_dl
;
529 ret
= CHECK_CU(cu
->cuInit(0));
533 ret
= CHECK_CU(cu
->cuDeviceGetCount(&device_count
));
/* -1 sentinel: no matching device found yet. */
537 hwctx
->internal
->cuda_device
= -1;
538 for (i
= 0; i
< device_count
; i
++) {
542 ret
= CHECK_CU(cu
->cuDeviceGet(&dev
, i
));
546 ret
= CHECK_CU(cu
->cuDeviceGetUuid(&uuid
, dev
));
550 if (memcmp(src_uuid
, uuid
.bytes
, sizeof (uuid
.bytes
)) == 0) {
551 hwctx
->internal
->cuda_device
= dev
;
556 if (hwctx
->internal
->cuda_device
== -1) {
557 av_log(device_ctx
, AV_LOG_ERROR
, "Could not derive CUDA device.\n");
561 ret
= cuda_context_init(device_ctx
, flags
);
/* Error path: releases everything acquired so far. */
568 cuda_device_uninit(device_ctx
);
/*
 * HWContextType registration for AV_HWDEVICE_TYPE_CUDA, tying the callbacks
 * above into libavutil's generic hwcontext machinery.
 * NOTE(review): view is truncated — entries such as ".name" and the closing
 * "};" are not visible here.
 */
572 const HWContextType ff_hwcontext_type_cuda
= {
573 .type
= AV_HWDEVICE_TYPE_CUDA
,
576 .device_hwctx_size
= sizeof(CUDADeviceContext
),
577 .frames_hwctx_size
= sizeof(CUDAFramesContext
),
579 .device_create
= cuda_device_create
,
580 .device_derive
= cuda_device_derive
,
581 .device_init
= cuda_device_init
,
582 .device_uninit
= cuda_device_uninit
,
583 .frames_get_constraints
= cuda_frames_get_constraints
,
584 .frames_init
= cuda_frames_init
,
585 .frames_get_buffer
= cuda_get_buffer
,
586 .transfer_get_formats
= cuda_transfer_get_formats
,
/* Uploads and downloads share one implementation; it inspects
 * hw_frames_ctx on each side to pick the copy direction. */
587 .transfer_data_to
= cuda_transfer_data
,
588 .transfer_data_from
= cuda_transfer_data
,
590 .pix_fmts
= (const enum AVPixelFormat
[]){ AV_PIX_FMT_CUDA
, AV_PIX_FMT_NONE
},