Allow cuda kernel to run in parallell
This commit is contained in:
@@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void freeStream(CUstream stream) {
|
||||||
|
CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream");
|
||||||
|
}
|
||||||
|
|
||||||
class img_t : public platf::img_t {
|
class img_t : public platf::img_t {
|
||||||
public:
|
public:
|
||||||
tex_t tex;
|
tex_t tex;
|
||||||
@@ -95,7 +99,8 @@ public:
|
|||||||
this->hwframe.reset(frame);
|
this->hwframe.reset(frame);
|
||||||
this->frame = frame;
|
this->frame = frame;
|
||||||
|
|
||||||
if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
|
auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
|
||||||
|
if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
|
||||||
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
|
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@@ -106,6 +111,15 @@ public:
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx;
|
||||||
|
|
||||||
|
stream = make_stream();
|
||||||
|
if(!stream) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cuda_ctx->stream = stream.get();
|
||||||
|
|
||||||
auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
|
auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
|
||||||
if(!sws_opt) {
|
if(!sws_opt) {
|
||||||
return -1;
|
return -1;
|
||||||
@@ -143,13 +157,14 @@ public:
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 });
|
sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 });
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaTextureObject_t tex_obj(const tex_t &tex) const {
|
cudaTextureObject_t tex_obj(const tex_t &tex) const {
|
||||||
return linear_interpolation ? tex.texture.linear : tex.texture.point;
|
return linear_interpolation ? tex.texture.linear : tex.texture.point;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stream_t stream;
|
||||||
frame_t hwframe;
|
frame_t hwframe;
|
||||||
|
|
||||||
int width, height;
|
int width, height;
|
||||||
@@ -163,7 +178,7 @@ public:
|
|||||||
class cuda_ram_t : public cuda_t {
|
class cuda_ram_t : public cuda_t {
|
||||||
public:
|
public:
|
||||||
int convert(platf::img_t &img) override {
|
int convert(platf::img_t &img) override {
|
||||||
return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex));
|
return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
int set_frame(AVFrame *frame) {
|
int set_frame(AVFrame *frame) {
|
||||||
@@ -187,7 +202,7 @@ public:
|
|||||||
class cuda_vram_t : public cuda_t {
|
class cuda_vram_t : public cuda_t {
|
||||||
public:
|
public:
|
||||||
int convert(platf::img_t &img) override {
|
int convert(platf::img_t &img) override {
|
||||||
return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex));
|
return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -257,6 +272,28 @@ int init() {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class ctx_t {
|
||||||
|
public:
|
||||||
|
ctx_t(NVFBC_SESSION_HANDLE handle) {
|
||||||
|
NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER };
|
||||||
|
|
||||||
|
if(func.nvFBCBindContext(handle, ¶ms)) {
|
||||||
|
BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle);
|
||||||
|
}
|
||||||
|
|
||||||
|
this->handle = handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
~ctx_t() {
|
||||||
|
NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER };
|
||||||
|
if(func.nvFBCReleaseContext(handle, ¶ms)) {
|
||||||
|
BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
NVFBC_SESSION_HANDLE handle;
|
||||||
|
};
|
||||||
|
|
||||||
class handle_t {
|
class handle_t {
|
||||||
enum flag_e {
|
enum flag_e {
|
||||||
SESSION_HANDLE,
|
SESSION_HANDLE,
|
||||||
@@ -348,24 +385,26 @@ public:
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
~handle_t() {
|
int reset() {
|
||||||
if(!handle_flags[SESSION_HANDLE]) {
|
if(!handle_flags[SESSION_HANDLE]) {
|
||||||
return;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(handle_flags[SESSION_CAPTURE]) {
|
stop();
|
||||||
NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
|
|
||||||
|
|
||||||
if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) {
|
|
||||||
BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
|
NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
|
||||||
|
|
||||||
if(func.nvFBCDestroyHandle(handle, ¶ms)) {
|
if(func.nvFBCDestroyHandle(handle, ¶ms)) {
|
||||||
BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
|
BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
handle_flags[SESSION_HANDLE] = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
~handle_t() {
|
||||||
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::bitset<MAX_FLAGS> handle_flags;
|
std::bitset<MAX_FLAGS> handle_flags;
|
||||||
@@ -381,6 +420,8 @@ public:
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx_t ctx { handle->handle };
|
||||||
|
|
||||||
auto status_params = handle->status();
|
auto status_params = handle->status();
|
||||||
if(!status_params) {
|
if(!status_params) {
|
||||||
return -1;
|
return -1;
|
||||||
@@ -443,8 +484,9 @@ public:
|
|||||||
// Force display_t::capture to initialize handle_t::capture
|
// Force display_t::capture to initialize handle_t::capture
|
||||||
cursor_visible = !*cursor;
|
cursor_visible = !*cursor;
|
||||||
|
|
||||||
|
ctx_t ctx { handle.handle };
|
||||||
auto fg = util::fail_guard([&]() {
|
auto fg = util::fail_guard([&]() {
|
||||||
handle.stop();
|
handle.reset();
|
||||||
});
|
});
|
||||||
|
|
||||||
while(img) {
|
while(img) {
|
||||||
|
|||||||
@@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) {
|
|||||||
CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
|
CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void freeCudaStream_t::operator()(cudaStream_t ptr) {
|
||||||
|
CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream");
|
||||||
|
}
|
||||||
|
|
||||||
|
stream_t make_stream(int flags) {
|
||||||
|
cudaStream_t stream;
|
||||||
|
|
||||||
|
if(!flags) {
|
||||||
|
CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags");
|
||||||
|
}
|
||||||
|
|
||||||
|
return stream_t { stream };
|
||||||
|
}
|
||||||
|
|
||||||
inline __device__ float3 bgra_to_rgb(uchar4 vec) {
|
inline __device__ float3 bgra_to_rgb(uchar4 vec) {
|
||||||
return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
|
return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
|
||||||
}
|
}
|
||||||
@@ -260,18 +277,18 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
|
|||||||
return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
|
return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
|
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) {
|
||||||
return convert(Y, UV, pitchY, pitchUV, texture, viewport);
|
return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport);
|
||||||
}
|
}
|
||||||
|
|
||||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
|
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) {
|
||||||
int threadsX = viewport.width / 2;
|
int threadsX = viewport.width / 2;
|
||||||
int threadsY = viewport.height;
|
int threadsY = viewport.height;
|
||||||
|
|
||||||
dim3 block(threadsPerBlock);
|
dim3 block(threadsPerBlock);
|
||||||
dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
|
dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
|
||||||
|
|
||||||
RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
|
RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
|
||||||
|
|
||||||
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
|
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,16 +1,17 @@
|
|||||||
#if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
|
#if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
|
||||||
#define SUNSHINE_PLATFORM_CUDA_H
|
#define SUNSHINE_PLATFORM_CUDA_H
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
namespace platf {
|
namespace platf {
|
||||||
class hwdevice_t;
|
class hwdevice_t;
|
||||||
class img_t;
|
class img_t;
|
||||||
}
|
} // namespace platf
|
||||||
|
|
||||||
namespace cuda {
|
namespace cuda {
|
||||||
|
|
||||||
namespace nvfbc {
|
namespace nvfbc {
|
||||||
std::vector<std::string> display_names();
|
std::vector<std::string> display_names();
|
||||||
}
|
}
|
||||||
@@ -21,8 +22,10 @@ int init();
|
|||||||
typedef struct cudaArray *cudaArray_t;
|
typedef struct cudaArray *cudaArray_t;
|
||||||
|
|
||||||
#if !defined(__CUDACC__)
|
#if !defined(__CUDACC__)
|
||||||
|
typedef struct CUstream_st *cudaStream_t;
|
||||||
typedef unsigned long long cudaTextureObject_t;
|
typedef unsigned long long cudaTextureObject_t;
|
||||||
#else /* defined(__CUDACC__) */
|
#else /* defined(__CUDACC__) */
|
||||||
|
typedef __location__(device_builtin) struct CUstream_st *cudaStream_t;
|
||||||
typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
|
typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
|
||||||
#endif /* !defined(__CUDACC__) */
|
#endif /* !defined(__CUDACC__) */
|
||||||
|
|
||||||
@@ -33,7 +36,15 @@ public:
|
|||||||
void operator()(void *ptr);
|
void operator()(void *ptr);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class freeCudaStream_t {
|
||||||
|
public:
|
||||||
|
void operator()(cudaStream_t ptr);
|
||||||
|
};
|
||||||
|
|
||||||
using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
|
using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
|
||||||
|
using stream_t = std::unique_ptr<CUstream_st, freeCudaStream_t>;
|
||||||
|
|
||||||
|
stream_t make_stream(int flags = 0);
|
||||||
|
|
||||||
struct viewport_t {
|
struct viewport_t {
|
||||||
int width, height;
|
int width, height;
|
||||||
@@ -75,8 +86,8 @@ public:
|
|||||||
static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
|
static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
|
||||||
|
|
||||||
// Converts loaded image into a CUDevicePtr
|
// Converts loaded image into a CUDevicePtr
|
||||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
|
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream);
|
||||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
|
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport);
|
||||||
|
|
||||||
void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
|
void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
|
||||||
|
|
||||||
|
|||||||
@@ -264,13 +264,11 @@ std::unique_ptr<deinit_t> init() {
|
|||||||
#endif
|
#endif
|
||||||
#ifdef SUNSHINE_BUILD_CUDA
|
#ifdef SUNSHINE_BUILD_CUDA
|
||||||
if(verify_nvfbc()) {
|
if(verify_nvfbc()) {
|
||||||
BOOST_LOG(info) << "Using NvFBC for screencasting"sv;
|
|
||||||
sources[source::NVFBC] = true;
|
sources[source::NVFBC] = true;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifdef SUNSHINE_BUILD_WAYLAND
|
#ifdef SUNSHINE_BUILD_WAYLAND
|
||||||
if(verify_wl()) {
|
if(verify_wl()) {
|
||||||
BOOST_LOG(info) << "Using Wayland for screencasting"sv;
|
|
||||||
sources[source::WAYLAND] = true;
|
sources[source::WAYLAND] = true;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -282,13 +280,11 @@ std::unique_ptr<deinit_t> init() {
|
|||||||
display_cursor = false;
|
display_cursor = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_LOG(info) << "Using KMS for screencasting"sv;
|
|
||||||
sources[source::KMS] = true;
|
sources[source::KMS] = true;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifdef SUNSHINE_BUILD_X11
|
#ifdef SUNSHINE_BUILD_X11
|
||||||
if(verify_x11()) {
|
if(verify_x11()) {
|
||||||
BOOST_LOG(info) << "Using X11 for screencasting"sv;
|
|
||||||
sources[source::X11] = true;
|
sources[source::X11] = true;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user