From 4177b020647131be59d85d8b19c8e3143e820b36 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 17:58:35 +0200 Subject: [PATCH] Allow cuda kernel to run in parallell --- sunshine/platform/linux/cuda.cpp | 70 +++++++++++++++++++++++++------- sunshine/platform/linux/cuda.cu | 25 ++++++++++-- sunshine/platform/linux/cuda.h | 25 ++++++++---- sunshine/platform/linux/misc.cpp | 4 -- 4 files changed, 95 insertions(+), 29 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 8a25eb27..b96dcf17 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) { return 0; } +void freeStream(CUstream stream) { + CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream"); +} + class img_t : public platf::img_t { public: tex_t tex; @@ -95,7 +99,8 @@ public: this->hwframe.reset(frame); this->frame = frame; - if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) { + auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data; + if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) { BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv; return -1; } @@ -106,6 +111,15 @@ public: return -1; } + auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx; + + stream = make_stream(); + if(!stream) { + return -1; + } + + cuda_ctx->stream = stream.get(); + auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4); if(!sws_opt) { return -1; @@ -143,13 +157,14 @@ public: return; } - sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 }); + sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 }); } cudaTextureObject_t tex_obj(const tex_t &tex) const { return linear_interpolation ? tex.texture.linear : tex.texture.point; } + stream_t stream; frame_t hwframe; int width, height; @@ -163,7 +178,7 @@ public: class cuda_ram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex)); + return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get()); } int set_frame(AVFrame *frame) { @@ -187,7 +202,7 @@ public: class cuda_vram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex)); + return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get()); } }; @@ -257,6 +272,28 @@ int init() { return 0; } +class ctx_t { +public: + ctx_t(NVFBC_SESSION_HANDLE handle) { + NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER }; + + if(func.nvFBCBindContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle); + } + + this->handle = handle; + } + + ~ctx_t() { + NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER }; + if(func.nvFBCReleaseContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle); + } + } + + NVFBC_SESSION_HANDLE handle; +}; + class handle_t { enum flag_e { SESSION_HANDLE, @@ -348,24 +385,26 @@ public: return 0; } - ~handle_t() { + int reset() { if(!handle_flags[SESSION_HANDLE]) { - return; + return 0; } - if(handle_flags[SESSION_CAPTURE]) { - NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER }; - - if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) { - BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle); - } - } + stop(); NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; if(func.nvFBCDestroyHandle(handle, ¶ms)) { BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle); } + + handle_flags[SESSION_HANDLE] = false; + + return 0; + } + + ~handle_t() { + reset(); } std::bitset handle_flags; @@ -381,6 +420,8 @@ public: return -1; } + ctx_t ctx { handle->handle }; + auto status_params = handle->status(); if(!status_params) { return -1; @@ -443,8 +484,9 @@ public: // Force display_t::capture to initialize handle_t::capture cursor_visible = !*cursor; + ctx_t ctx { handle.handle }; auto fg = util::fail_guard([&]() { - handle.stop(); + handle.reset(); }); while(img) { diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index e93f7d9f..acf2d76d 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) { CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer"); } +void freeCudaStream_t::operator()(cudaStream_t ptr) { + CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream"); +} + +stream_t make_stream(int flags) { + cudaStream_t stream; + + if(!flags) { + CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream"); + } + else { + CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags"); + } + + return stream_t { stream }; +} + inline __device__ float3 bgra_to_rgb(uchar4 vec) { return make_float3((float)vec.z, (float)vec.y, (float)vec.x); } @@ -260,18 +277,18 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i return std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { - return convert(Y, UV, pitchY, pitchUV, texture, viewport); +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) { + return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) { +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) { int threadsX = viewport.width / 2; int threadsY = viewport.height; dim3 block(threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), threadsY); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index e46b4759..7e81ae99 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,16 +1,17 @@ #if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA) #define SUNSHINE_PLATFORM_CUDA_H -#include #include #include +#include namespace platf { - class hwdevice_t; - class img_t; -} +class hwdevice_t; +class img_t; +} // namespace platf namespace cuda { + namespace nvfbc { std::vector display_names(); } @@ -21,8 +22,10 @@ int init(); typedef struct cudaArray *cudaArray_t; #if !defined(__CUDACC__) +typedef struct CUstream_st *cudaStream_t; typedef unsigned long long cudaTextureObject_t; #else /* defined(__CUDACC__) */ +typedef __location__(device_builtin) struct CUstream_st *cudaStream_t; typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; #endif /* !defined(__CUDACC__) */ @@ -33,7 +36,15 @@ public: void operator()(void *ptr); }; -using ptr_t = std::unique_ptr; +class freeCudaStream_t { +public: + void operator()(cudaStream_t ptr); +}; + +using ptr_t = std::unique_ptr; +using stream_t = std::unique_ptr; + +stream_t make_stream(int flags = 0); struct viewport_t { int width, height; @@ -75,8 +86,8 @@ public: static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture); - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport); void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index dd114ec4..6d7643c0 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -264,13 +264,11 @@ std::unique_ptr init() { #endif #ifdef SUNSHINE_BUILD_CUDA if(verify_nvfbc()) { - BOOST_LOG(info) << "Using NvFBC for screencasting"sv; sources[source::NVFBC] = true; } #endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { - BOOST_LOG(info) << "Using Wayland for screencasting"sv; sources[source::WAYLAND] = true; } #endif @@ -282,13 +280,11 @@ std::unique_ptr init() { display_cursor = false; } - BOOST_LOG(info) << "Using KMS for screencasting"sv; sources[source::KMS] = true; } #endif #ifdef SUNSHINE_BUILD_X11 if(verify_x11()) { - BOOST_LOG(info) << "Using X11 for screencasting"sv; sources[source::X11] = true; } #endif