From 4177b020647131be59d85d8b19c8e3143e820b36 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 27 Sep 2021 17:58:35 +0200
Subject: [PATCH] Allow cuda kernel to run in parallell

---
 sunshine/platform/linux/cuda.cpp | 70 +++++++++++++++++++++++++-------
 sunshine/platform/linux/cuda.cu  | 25 ++++++++++--
 sunshine/platform/linux/cuda.h   | 25 ++++++++----
 sunshine/platform/linux/misc.cpp |  4 --
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 8a25eb27..b96dcf17 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) {
   return 0;
 }
 
+void freeStream(CUstream stream) {
+  CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream");
+}
+
 class img_t : public platf::img_t {
 public:
   tex_t tex;
@@ -95,7 +99,8 @@ public:
     this->hwframe.reset(frame);
     this->frame = frame;
 
-    if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
+    auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+    if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
       BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
       return -1;
     }
@@ -106,6 +111,15 @@ public:
       return -1;
     }
 
+    auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx;
+
+    stream = make_stream();
+    if(!stream) {
+      return -1;
+    }
+
+    cuda_ctx->stream = stream.get();
+
     auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
     if(!sws_opt) {
       return -1;
@@ -143,13 +157,14 @@ public:
       return;
     }
 
-    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 });
+    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 });
   }
 
   cudaTextureObject_t tex_obj(const tex_t &tex) const {
     return linear_interpolation ? tex.texture.linear : tex.texture.point;
   }
 
+  stream_t stream;
   frame_t hwframe;
 
   int width, height;
@@ -163,7 +178,7 @@ public:
 class cuda_ram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex));
+    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
   }
 
   int set_frame(AVFrame *frame) {
@@ -187,7 +202,7 @@ public:
 class cuda_vram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex));
+    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get());
   }
 };
 
@@ -257,6 +272,28 @@ int init() {
   return 0;
 }
 
+class ctx_t {
+public:
+  ctx_t(NVFBC_SESSION_HANDLE handle) {
+    NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER };
+
+    if(func.nvFBCBindContext(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle);
+    }
+
+    this->handle = handle;
+  }
+
+  ~ctx_t() {
+    NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER };
+    if(func.nvFBCReleaseContext(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle);
+    }
+  }
+
+  NVFBC_SESSION_HANDLE handle;
+};
+
 class handle_t {
   enum flag_e {
     SESSION_HANDLE,
@@ -348,24 +385,26 @@ public:
     return 0;
   }
 
-  ~handle_t() {
+  int reset() {
     if(!handle_flags[SESSION_HANDLE]) {
-      return;
+      return 0;
     }
 
-    if(handle_flags[SESSION_CAPTURE]) {
-      NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
-
-      if(func.nvFBCDestroyCaptureSession(handle, &params)) {
-        BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
-      }
-    }
+    stop();
 
     NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
 
     if(func.nvFBCDestroyHandle(handle, &params)) {
       BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
     }
+
+    handle_flags[SESSION_HANDLE] = false;
+
+    return 0;
+  }
+
+  ~handle_t() {
+    reset();
   }
 
   std::bitset<MAX_FLAGS> handle_flags;
@@ -381,6 +420,8 @@ public:
       return -1;
     }
 
+    ctx_t ctx { handle->handle };
+
     auto status_params = handle->status();
     if(!status_params) {
       return -1;
@@ -443,8 +484,9 @@ public:
     // Force display_t::capture to initialize handle_t::capture
     cursor_visible = !*cursor;
 
+    ctx_t ctx { handle.handle };
     auto fg = util::fail_guard([&]() {
-      handle.stop();
+      handle.reset();
     });
 
     while(img) {
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index e93f7d9f..acf2d76d 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) {
   CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
 }
 
+void freeCudaStream_t::operator()(cudaStream_t ptr) {
+  CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream");
+}
+
+stream_t make_stream(int flags) {
+  cudaStream_t stream;
+
+  if(!flags) {
+    CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream");
+  }
+  else {
+    CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags");
+  }
+
+  return stream_t { stream };
+}
+
 inline __device__ float3 bgra_to_rgb(uchar4 vec) {
   return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
 }
@@ -260,18 +277,18 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
   return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
-  return convert(Y, UV, pitchY, pitchUV, texture, viewport);
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) {
+  return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport);
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) {
   int threadsX = viewport.width / 2;
   int threadsY = viewport.height;
 
   dim3 block(threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
 
-  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
+  RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index e46b4759..7e81ae99 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,16 +1,17 @@
 #if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
 #define SUNSHINE_PLATFORM_CUDA_H
 
-#include <vector>
 #include <memory>
 #include <string>
+#include <vector>
 
 namespace platf {
-  class hwdevice_t;
-  class img_t;
-}
+class hwdevice_t;
+class img_t;
+} // namespace platf
 
 namespace cuda {
+
 namespace nvfbc {
 std::vector<std::string> display_names();
 }
@@ -21,8 +22,10 @@ int init();
 typedef struct cudaArray *cudaArray_t;
 
 #if !defined(__CUDACC__)
+typedef struct CUstream_st *cudaStream_t;
 typedef unsigned long long cudaTextureObject_t;
 #else  /* defined(__CUDACC__) */
+typedef __location__(device_builtin) struct CUstream_st *cudaStream_t;
 typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
 #endif /* !defined(__CUDACC__) */
 
@@ -33,7 +36,15 @@ public:
   void operator()(void *ptr);
 };
 
-using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
+class freeCudaStream_t {
+public:
+  void operator()(cudaStream_t ptr);
+};
+
+using ptr_t    = std::unique_ptr<void, freeCudaPtr_t>;
+using stream_t = std::unique_ptr<CUstream_st, freeCudaStream_t>;
+
+stream_t make_stream(int flags = 0);
 
 struct viewport_t {
   int width, height;
@@ -75,8 +86,8 @@ public:
   static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport);
 
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index dd114ec4..6d7643c0 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -264,13 +264,11 @@ std::unique_ptr<deinit_t> init() {
 #endif
 #ifdef SUNSHINE_BUILD_CUDA
   if(verify_nvfbc()) {
-    BOOST_LOG(info) << "Using NvFBC for screencasting"sv;
     sources[source::NVFBC] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
-    BOOST_LOG(info) << "Using Wayland for screencasting"sv;
     sources[source::WAYLAND] = true;
   }
 #endif
@@ -282,13 +280,11 @@ std::unique_ptr<deinit_t> init() {
       display_cursor = false;
     }
 
-    BOOST_LOG(info) << "Using KMS for screencasting"sv;
     sources[source::KMS] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_X11
   if(verify_x11()) {
-    BOOST_LOG(info) << "Using X11 for screencasting"sv;
     sources[source::X11] = true;
   }
 #endif