Allow cuda kernel to run in parallell
This commit is contained in:
@@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void freeStream(CUstream stream) {
|
||||
CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream");
|
||||
}
|
||||
|
||||
class img_t : public platf::img_t {
|
||||
public:
|
||||
tex_t tex;
|
||||
@@ -95,7 +99,8 @@ public:
|
||||
this->hwframe.reset(frame);
|
||||
this->frame = frame;
|
||||
|
||||
if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
|
||||
auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
|
||||
if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
|
||||
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
|
||||
return -1;
|
||||
}
|
||||
@@ -106,6 +111,15 @@ public:
|
||||
return -1;
|
||||
}
|
||||
|
||||
auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx;
|
||||
|
||||
stream = make_stream();
|
||||
if(!stream) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
cuda_ctx->stream = stream.get();
|
||||
|
||||
auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
|
||||
if(!sws_opt) {
|
||||
return -1;
|
||||
@@ -143,13 +157,14 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 });
|
||||
sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 });
|
||||
}
|
||||
|
||||
cudaTextureObject_t tex_obj(const tex_t &tex) const {
|
||||
return linear_interpolation ? tex.texture.linear : tex.texture.point;
|
||||
}
|
||||
|
||||
stream_t stream;
|
||||
frame_t hwframe;
|
||||
|
||||
int width, height;
|
||||
@@ -163,7 +178,7 @@ public:
|
||||
class cuda_ram_t : public cuda_t {
|
||||
public:
|
||||
int convert(platf::img_t &img) override {
|
||||
return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex));
|
||||
return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
|
||||
}
|
||||
|
||||
int set_frame(AVFrame *frame) {
|
||||
@@ -187,7 +202,7 @@ public:
|
||||
class cuda_vram_t : public cuda_t {
|
||||
public:
|
||||
int convert(platf::img_t &img) override {
|
||||
return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex));
|
||||
return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -257,6 +272,28 @@ int init() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
class ctx_t {
|
||||
public:
|
||||
ctx_t(NVFBC_SESSION_HANDLE handle) {
|
||||
NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER };
|
||||
|
||||
if(func.nvFBCBindContext(handle, ¶ms)) {
|
||||
BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle);
|
||||
}
|
||||
|
||||
this->handle = handle;
|
||||
}
|
||||
|
||||
~ctx_t() {
|
||||
NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER };
|
||||
if(func.nvFBCReleaseContext(handle, ¶ms)) {
|
||||
BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle);
|
||||
}
|
||||
}
|
||||
|
||||
NVFBC_SESSION_HANDLE handle;
|
||||
};
|
||||
|
||||
class handle_t {
|
||||
enum flag_e {
|
||||
SESSION_HANDLE,
|
||||
@@ -348,24 +385,26 @@ public:
|
||||
return 0;
|
||||
}
|
||||
|
||||
~handle_t() {
|
||||
int reset() {
|
||||
if(!handle_flags[SESSION_HANDLE]) {
|
||||
return;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(handle_flags[SESSION_CAPTURE]) {
|
||||
NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
|
||||
|
||||
if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) {
|
||||
BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
|
||||
}
|
||||
}
|
||||
stop();
|
||||
|
||||
NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
|
||||
|
||||
if(func.nvFBCDestroyHandle(handle, ¶ms)) {
|
||||
BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
|
||||
}
|
||||
|
||||
handle_flags[SESSION_HANDLE] = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
~handle_t() {
|
||||
reset();
|
||||
}
|
||||
|
||||
std::bitset<MAX_FLAGS> handle_flags;
|
||||
@@ -381,6 +420,8 @@ public:
|
||||
return -1;
|
||||
}
|
||||
|
||||
ctx_t ctx { handle->handle };
|
||||
|
||||
auto status_params = handle->status();
|
||||
if(!status_params) {
|
||||
return -1;
|
||||
@@ -443,8 +484,9 @@ public:
|
||||
// Force display_t::capture to initialize handle_t::capture
|
||||
cursor_visible = !*cursor;
|
||||
|
||||
ctx_t ctx { handle.handle };
|
||||
auto fg = util::fail_guard([&]() {
|
||||
handle.stop();
|
||||
handle.reset();
|
||||
});
|
||||
|
||||
while(img) {
|
||||
|
||||
@@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) {
|
||||
CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
|
||||
}
|
||||
|
||||
void freeCudaStream_t::operator()(cudaStream_t ptr) {
|
||||
CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream");
|
||||
}
|
||||
|
||||
stream_t make_stream(int flags) {
|
||||
cudaStream_t stream;
|
||||
|
||||
if(!flags) {
|
||||
CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream");
|
||||
}
|
||||
else {
|
||||
CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags");
|
||||
}
|
||||
|
||||
return stream_t { stream };
|
||||
}
|
||||
|
||||
inline __device__ float3 bgra_to_rgb(uchar4 vec) {
|
||||
return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
|
||||
}
|
||||
@@ -260,18 +277,18 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
|
||||
return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
|
||||
}
|
||||
|
||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
|
||||
return convert(Y, UV, pitchY, pitchUV, texture, viewport);
|
||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) {
|
||||
return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport);
|
||||
}
|
||||
|
||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
|
||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) {
|
||||
int threadsX = viewport.width / 2;
|
||||
int threadsY = viewport.height;
|
||||
|
||||
dim3 block(threadsPerBlock);
|
||||
dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
|
||||
|
||||
RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
|
||||
RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
|
||||
|
||||
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
|
||||
}
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
#if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
|
||||
#define SUNSHINE_PLATFORM_CUDA_H
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace platf {
|
||||
class hwdevice_t;
|
||||
class img_t;
|
||||
}
|
||||
class hwdevice_t;
|
||||
class img_t;
|
||||
} // namespace platf
|
||||
|
||||
namespace cuda {
|
||||
|
||||
namespace nvfbc {
|
||||
std::vector<std::string> display_names();
|
||||
}
|
||||
@@ -21,8 +22,10 @@ int init();
|
||||
typedef struct cudaArray *cudaArray_t;
|
||||
|
||||
#if !defined(__CUDACC__)
|
||||
typedef struct CUstream_st *cudaStream_t;
|
||||
typedef unsigned long long cudaTextureObject_t;
|
||||
#else /* defined(__CUDACC__) */
|
||||
typedef __location__(device_builtin) struct CUstream_st *cudaStream_t;
|
||||
typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
|
||||
#endif /* !defined(__CUDACC__) */
|
||||
|
||||
@@ -33,7 +36,15 @@ public:
|
||||
void operator()(void *ptr);
|
||||
};
|
||||
|
||||
using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
|
||||
class freeCudaStream_t {
|
||||
public:
|
||||
void operator()(cudaStream_t ptr);
|
||||
};
|
||||
|
||||
using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
|
||||
using stream_t = std::unique_ptr<CUstream_st, freeCudaStream_t>;
|
||||
|
||||
stream_t make_stream(int flags = 0);
|
||||
|
||||
struct viewport_t {
|
||||
int width, height;
|
||||
@@ -75,8 +86,8 @@ public:
|
||||
static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
|
||||
|
||||
// Converts loaded image into a CUDevicePtr
|
||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
|
||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
|
||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream);
|
||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport);
|
||||
|
||||
void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
|
||||
|
||||
|
||||
@@ -264,13 +264,11 @@ std::unique_ptr<deinit_t> init() {
|
||||
#endif
|
||||
#ifdef SUNSHINE_BUILD_CUDA
|
||||
if(verify_nvfbc()) {
|
||||
BOOST_LOG(info) << "Using NvFBC for screencasting"sv;
|
||||
sources[source::NVFBC] = true;
|
||||
}
|
||||
#endif
|
||||
#ifdef SUNSHINE_BUILD_WAYLAND
|
||||
if(verify_wl()) {
|
||||
BOOST_LOG(info) << "Using Wayland for screencasting"sv;
|
||||
sources[source::WAYLAND] = true;
|
||||
}
|
||||
#endif
|
||||
@@ -282,13 +280,11 @@ std::unique_ptr<deinit_t> init() {
|
||||
display_cursor = false;
|
||||
}
|
||||
|
||||
BOOST_LOG(info) << "Using KMS for screencasting"sv;
|
||||
sources[source::KMS] = true;
|
||||
}
|
||||
#endif
|
||||
#ifdef SUNSHINE_BUILD_X11
|
||||
if(verify_x11()) {
|
||||
BOOST_LOG(info) << "Using X11 for screencasting"sv;
|
||||
sources[source::X11] = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user