avfilter/avfiltergraph: fix constant string comparison
[ffmpeg.git] / libavutil / hwcontext_cuda.c
1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #include "buffer.h"
20 #include "common.h"
21 #include "hwcontext.h"
22 #include "hwcontext_internal.h"
23 #include "hwcontext_cuda_internal.h"
24 #if CONFIG_VULKAN
25 #include "hwcontext_vulkan.h"
26 #endif
27 #include "cuda_check.h"
28 #include "mem.h"
29 #include "pixdesc.h"
30 #include "pixfmt.h"
31 #include "imgutils.h"
32
/**
 * Per-frames-context state, stored in AVHWFramesContext.hwctx.
 */
typedef struct CUDAFramesContext {
    int shift_width, shift_height;  ///< chroma subsampling shifts of sw_format (set in cuda_frames_init)
    int tex_alignment;              ///< device texture alignment, used as the plane linesize alignment
} CUDAFramesContext;
37
/**
 * Device context backing storage: the public AVCUDADeviceContext and its
 * internal part live in one allocation; cuda_device_init() points
 * p.internal at the embedded `internal` member.
 */
typedef struct CUDADeviceContext {
    AVCUDADeviceContext p;
    AVCUDADeviceContextInternal internal;
} CUDADeviceContext;
42
/**
 * Software pixel formats that can back an AV_PIX_FMT_CUDA frames context.
 * Checked in cuda_frames_init() and reported via
 * cuda_frames_get_constraints().
 */
static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NV16,
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_YUVA420P,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016,
    AV_PIX_FMT_P210,
    AV_PIX_FMT_P216,
    AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV420P10,
    AV_PIX_FMT_YUV422P10,
    AV_PIX_FMT_YUV444P10,
    AV_PIX_FMT_YUV444P10MSB,
    AV_PIX_FMT_YUV444P12MSB,
    AV_PIX_FMT_YUV444P16,
    AV_PIX_FMT_0RGB32,
    AV_PIX_FMT_0BGR32,
    AV_PIX_FMT_RGB32,
    AV_PIX_FMT_BGR32,
#if CONFIG_VULKAN
    AV_PIX_FMT_VULKAN,
#endif
};
68
/* Wrap a CUDA driver API call: log failures and map the CUresult to an
 * AVERROR code. Requires locals named `device_ctx` and `cu` in scope. */
#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
70
71 static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
72 const void *hwconfig,
73 AVHWFramesConstraints *constraints)
74 {
75 int i;
76
77 constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
78 sizeof(*constraints->valid_sw_formats));
79 if (!constraints->valid_sw_formats)
80 return AVERROR(ENOMEM);
81
82 for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
83 constraints->valid_sw_formats[i] = supported_formats[i];
84 constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;
85
86 constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
87 if (!constraints->valid_hw_formats)
88 return AVERROR(ENOMEM);
89
90 constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
91 constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;
92
93 return 0;
94 }
95
96 static void cuda_buffer_free(void *opaque, uint8_t *data)
97 {
98 AVHWFramesContext *ctx = opaque;
99 AVHWDeviceContext *device_ctx = ctx->device_ctx;
100 AVCUDADeviceContext *hwctx = device_ctx->hwctx;
101 CudaFunctions *cu = hwctx->internal->cuda_dl;
102
103 CUcontext dummy;
104
105 CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
106
107 CHECK_CU(cu->cuMemFree((CUdeviceptr)data));
108
109 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
110 }
111
112 static AVBufferRef *cuda_pool_alloc(void *opaque, size_t size)
113 {
114 AVHWFramesContext *ctx = opaque;
115 AVHWDeviceContext *device_ctx = ctx->device_ctx;
116 AVCUDADeviceContext *hwctx = device_ctx->hwctx;
117 CudaFunctions *cu = hwctx->internal->cuda_dl;
118
119 AVBufferRef *ret = NULL;
120 CUcontext dummy = NULL;
121 CUdeviceptr data;
122 int err;
123
124 err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
125 if (err < 0)
126 return NULL;
127
128 err = CHECK_CU(cu->cuMemAlloc(&data, size));
129 if (err < 0)
130 goto fail;
131
132 ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
133 if (!ret) {
134 CHECK_CU(cu->cuMemFree(data));
135 goto fail;
136 }
137
138 fail:
139 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
140 return ret;
141 }
142
143 static int cuda_frames_init(AVHWFramesContext *ctx)
144 {
145 AVHWDeviceContext *device_ctx = ctx->device_ctx;
146 AVCUDADeviceContext *hwctx = device_ctx->hwctx;
147 CUDAFramesContext *priv = ctx->hwctx;
148 CudaFunctions *cu = hwctx->internal->cuda_dl;
149 int err, i;
150
151 for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
152 if (ctx->sw_format == supported_formats[i])
153 break;
154 }
155 if (i == FF_ARRAY_ELEMS(supported_formats)) {
156 av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
157 av_get_pix_fmt_name(ctx->sw_format));
158 return AVERROR(ENOSYS);
159 }
160
161 err = CHECK_CU(cu->cuDeviceGetAttribute(&priv->tex_alignment,
162 14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */,
163 hwctx->internal->cuda_device));
164 if (err < 0)
165 return err;
166
167 av_log(ctx, AV_LOG_DEBUG, "CUDA texture alignment: %d\n", priv->tex_alignment);
168
169 // YUV420P is a special case.
170 // Since nvenc expects the U/V planes to have half the linesize of the Y plane
171 // alignment has to be doubled to ensure the U/V planes still end up aligned.
172 if (ctx->sw_format == AV_PIX_FMT_YUV420P)
173 priv->tex_alignment *= 2;
174
175 av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);
176
177 if (!ctx->pool) {
178 int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment);
179 if (size < 0)
180 return size;
181
182 ffhwframesctx(ctx)->pool_internal =
183 av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
184 if (!ffhwframesctx(ctx)->pool_internal)
185 return AVERROR(ENOMEM);
186 }
187
188 return 0;
189 }
190
/**
 * frames_get_buffer callback: take a device buffer from the pool and fill
 * in the frame's plane pointers and linesizes for ctx->sw_format.
 */
static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    CUDAFramesContext *priv = ctx->hwctx;
    int res;

    frame->buf[0] = av_buffer_pool_get(ctx->pool);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    /* Lay the planes out inside the single device buffer, aligned to the
     * texture alignment queried in cuda_frames_init(). */
    res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                               ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment);
    if (res < 0)
        return res;

    // YUV420P is a special case.
    // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned
    // NOTE: the assignment order below matters — data[2] must capture the
    // old data[1] before data[1] is recomputed from it.
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2;
        frame->data[2] = frame->data[1];
        frame->data[1] = frame->data[2] + frame->linesize[2] * (ctx->height / 2);
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width = ctx->width;
    frame->height = ctx->height;

    return 0;
}
219
220 static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
221 enum AVHWFrameTransferDirection dir,
222 enum AVPixelFormat **formats)
223 {
224 enum AVPixelFormat *fmts;
225
226 fmts = av_malloc_array(2, sizeof(*fmts));
227 if (!fmts)
228 return AVERROR(ENOMEM);
229
230 fmts[0] = ctx->sw_format;
231 fmts[1] = AV_PIX_FMT_NONE;
232
233 *formats = fmts;
234
235 return 0;
236 }
237
238 static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst,
239 const AVFrame *src)
240 {
241 CUDAFramesContext *priv = ctx->hwctx;
242 AVHWDeviceContext *device_ctx = ctx->device_ctx;
243 AVCUDADeviceContext *hwctx = device_ctx->hwctx;
244 CudaFunctions *cu = hwctx->internal->cuda_dl;
245
246 CUcontext dummy;
247 int i, ret;
248
249 if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) ||
250 (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA))
251 return AVERROR(ENOSYS);
252
253 ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
254 if (ret < 0)
255 return ret;
256
257 for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
258 CUDA_MEMCPY2D cpy = {
259 .srcPitch = src->linesize[i],
260 .dstPitch = dst->linesize[i],
261 .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]),
262 .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height),
263 };
264
265 if (src->hw_frames_ctx) {
266 cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
267 cpy.srcDevice = (CUdeviceptr)src->data[i];
268 } else {
269 cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
270 cpy.srcHost = src->data[i];
271 }
272
273 if (dst->hw_frames_ctx) {
274 cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
275 cpy.dstDevice = (CUdeviceptr)dst->data[i];
276 } else {
277 cpy.dstMemoryType = CU_MEMORYTYPE_HOST;
278 cpy.dstHost = dst->data[i];
279 }
280
281 ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
282 if (ret < 0)
283 goto exit;
284 }
285
286 if (!dst->hw_frames_ctx) {
287 ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
288 if (ret < 0)
289 goto exit;
290 }
291
292 exit:
293 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
294
295 return 0;
296 }
297
/**
 * Free device-context state: release or destroy the CUDA context if this
 * code created it, unload the dynamically loaded CUDA functions, and
 * clear the internal struct. Safe to call on a partially initialized
 * context (used on the error paths of create/derive).
 */
static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
{
    CUDADeviceContext *hwctx = device_ctx->hwctx;

    if (hwctx->p.internal) {
        CudaFunctions *cu = hwctx->internal.cuda_dl;

        if (hwctx->internal.is_allocated && hwctx->p.cuda_ctx) {
            /* A retained primary context must be released, not destroyed;
             * a caller-provided "current" context is left untouched. */
            if (hwctx->internal.flags & AV_CUDA_USE_PRIMARY_CONTEXT)
                CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal.cuda_device));
            else if (!(hwctx->internal.flags & AV_CUDA_USE_CURRENT_CONTEXT))
                CHECK_CU(cu->cuCtxDestroy(hwctx->p.cuda_ctx));

            hwctx->p.cuda_ctx = NULL;
        }

        cuda_free_functions(&hwctx->internal.cuda_dl);
        memset(&hwctx->internal, 0, sizeof(hwctx->internal));
        hwctx->p.internal = NULL;
    }
}
319
320 static int cuda_device_init(AVHWDeviceContext *ctx)
321 {
322 CUDADeviceContext *hwctx = ctx->hwctx;
323 int ret;
324
325 hwctx->p.internal = &hwctx->internal;
326
327 if (!hwctx->internal.cuda_dl) {
328 ret = cuda_load_functions(&hwctx->internal.cuda_dl, ctx);
329 if (ret < 0) {
330 av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
331 goto error;
332 }
333 }
334
335 return 0;
336
337 error:
338 cuda_device_uninit(ctx);
339 return ret;
340 }
341
/**
 * Acquire a CUDA context for the device according to `flags`:
 * - AV_CUDA_USE_PRIMARY_CONTEXT: retain the device's primary context,
 *   requiring its flags to be compatible with CU_CTX_SCHED_BLOCKING_SYNC;
 * - AV_CUDA_USE_CURRENT_CONTEXT: adopt whatever context is current on the
 *   calling thread;
 * - otherwise: create a dedicated context.
 */
static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) {
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    CUcontext dummy;
    int ret, dev_active = 0;
    unsigned int dev_flags = 0;

    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;

    cu = hwctx->internal->cuda_dl;

    hwctx->internal->flags = flags;

    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
        ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
                       &dev_flags, &dev_active));
        if (ret < 0)
            return ret;

        /* An already-active primary context cannot have its flags changed;
         * an inactive one can be reconfigured before retaining it. */
        if (dev_active && dev_flags != desired_flags) {
            av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
            return AVERROR(ENOTSUP);
        } else if (dev_flags != desired_flags) {
            ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
                           desired_flags));
            if (ret < 0)
                return ret;
        }

        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
                       hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;
    } else if (flags & AV_CUDA_USE_CURRENT_CONTEXT) {
        ret = CHECK_CU(cu->cuCtxGetCurrent(&hwctx->cuda_ctx));
        if (ret < 0)
            return ret;
        av_log(device_ctx, AV_LOG_INFO, "Using current CUDA context.\n");
    } else {
        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
                       hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;

        /* cuCtxCreate makes the new context current; pop it so this
         * thread's context state is left unchanged. */
        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    }

    /* Mark the context as owned so cuda_device_uninit() knows to free it. */
    hwctx->internal->is_allocated = 1;

    // Setting stream to NULL will make functions automatically use the default CUstream
    hwctx->stream = NULL;

    return 0;
}
396
397 static int cuda_flags_from_opts(AVHWDeviceContext *device_ctx,
398 AVDictionary *opts, int *flags)
399 {
400 AVDictionaryEntry *primary_ctx_opt = av_dict_get(opts, "primary_ctx", NULL, 0);
401 AVDictionaryEntry *current_ctx_opt = av_dict_get(opts, "current_ctx", NULL, 0);
402
403 int use_primary_ctx = 0, use_current_ctx = 0;
404 if (primary_ctx_opt)
405 use_primary_ctx = strtol(primary_ctx_opt->value, NULL, 10);
406
407 if (current_ctx_opt)
408 use_current_ctx = strtol(current_ctx_opt->value, NULL, 10);
409
410 if (use_primary_ctx && use_current_ctx) {
411 av_log(device_ctx, AV_LOG_ERROR, "Requested both primary and current CUDA context simultaneously.\n");
412 return AVERROR(EINVAL);
413 }
414
415 if (primary_ctx_opt && use_primary_ctx) {
416 av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA primary device context\n");
417 *flags |= AV_CUDA_USE_PRIMARY_CONTEXT;
418 } else if (primary_ctx_opt) {
419 av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA primary device context\n");
420 *flags &= ~AV_CUDA_USE_PRIMARY_CONTEXT;
421 }
422
423 if (current_ctx_opt && use_current_ctx) {
424 av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA current device context\n");
425 *flags |= AV_CUDA_USE_CURRENT_CONTEXT;
426 } else if (current_ctx_opt) {
427 av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA current device context\n");
428 *flags &= ~AV_CUDA_USE_CURRENT_CONTEXT;
429 }
430
431 return 0;
432 }
433
/**
 * device_create callback: open CUDA device `device` (an index string,
 * parsed with base auto-detection; defaults to 0 when absent), honoring
 * the context-selection options in `opts`.
 */
static int cuda_device_create(AVHWDeviceContext *device_ctx,
                              const char *device,
                              AVDictionary *opts, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    int ret, device_idx = 0;

    ret = cuda_flags_from_opts(device_ctx, opts, &flags);
    if (ret < 0)
        goto error;

    if (device)
        device_idx = strtol(device, NULL, 0);

    ret = cuda_device_init(device_ctx);
    if (ret < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx));
    if (ret < 0)
        goto error;

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return ret;
}
473
474 static int cuda_device_derive(AVHWDeviceContext *device_ctx,
475 AVHWDeviceContext *src_ctx, AVDictionary *opts,
476 int flags) {
477 AVCUDADeviceContext *hwctx = device_ctx->hwctx;
478 CudaFunctions *cu;
479 const char *src_uuid = NULL;
480 #if CONFIG_VULKAN
481 VkPhysicalDeviceIDProperties vk_idp;
482 #endif
483 int ret, i, device_count;
484
485 ret = cuda_flags_from_opts(device_ctx, opts, &flags);
486 if (ret < 0)
487 goto error;
488
489 #if CONFIG_VULKAN
490 vk_idp = (VkPhysicalDeviceIDProperties) {
491 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
492 };
493 #endif
494
495 switch (src_ctx->type) {
496 #if CONFIG_VULKAN
497 #define TYPE PFN_vkGetPhysicalDeviceProperties2
498 case AV_HWDEVICE_TYPE_VULKAN: {
499 AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
500 TYPE prop_fn = (TYPE)vkctx->get_proc_addr(vkctx->inst, "vkGetPhysicalDeviceProperties2");
501 VkPhysicalDeviceProperties2 vk_dev_props = {
502 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
503 .pNext = &vk_idp,
504 };
505 prop_fn(vkctx->phys_dev, &vk_dev_props);
506 src_uuid = vk_idp.deviceUUID;
507 break;
508 }
509 #undef TYPE
510 #endif
511 default:
512 ret = AVERROR(ENOSYS);
513 goto error;
514 }
515
516 if (!src_uuid) {
517 av_log(device_ctx, AV_LOG_ERROR,
518 "Failed to get UUID of source device.\n");
519 ret = AVERROR(EINVAL);
520 goto error;
521 }
522
523 ret = cuda_device_init(device_ctx);
524 if (ret < 0)
525 goto error;
526
527 cu = hwctx->internal->cuda_dl;
528
529 ret = CHECK_CU(cu->cuInit(0));
530 if (ret < 0)
531 goto error;
532
533 ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
534 if (ret < 0)
535 goto error;
536
537 hwctx->internal->cuda_device = -1;
538 for (i = 0; i < device_count; i++) {
539 CUdevice dev;
540 CUuuid uuid;
541
542 ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
543 if (ret < 0)
544 goto error;
545
546 ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
547 if (ret < 0)
548 goto error;
549
550 if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
551 hwctx->internal->cuda_device = dev;
552 break;
553 }
554 }
555
556 if (hwctx->internal->cuda_device == -1) {
557 av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
558 goto error;
559 }
560
561 ret = cuda_context_init(device_ctx, flags);
562 if (ret < 0)
563 goto error;
564
565 return 0;
566
567 error:
568 cuda_device_uninit(device_ctx);
569 return ret;
570 }
571
/**
 * Hwcontext backend descriptor for AV_HWDEVICE_TYPE_CUDA; the same
 * transfer routine handles both upload and download.
 */
const HWContextType ff_hwcontext_type_cuda = {
    .type                 = AV_HWDEVICE_TYPE_CUDA,
    .name                 = "CUDA",

    .device_hwctx_size    = sizeof(CUDADeviceContext),
    .frames_hwctx_size    = sizeof(CUDAFramesContext),

    .device_create        = cuda_device_create,
    .device_derive        = cuda_device_derive,
    .device_init          = cuda_device_init,
    .device_uninit        = cuda_device_uninit,
    .frames_get_constraints = cuda_frames_get_constraints,
    .frames_init          = cuda_frames_init,
    .frames_get_buffer    = cuda_get_buffer,
    .transfer_get_formats = cuda_transfer_get_formats,
    .transfer_data_to     = cuda_transfer_data,
    .transfer_data_from   = cuda_transfer_data,

    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};