Use an actual cuda kernel to convert RGB to NV12
This commit is contained in:
@@ -1,9 +1,4 @@
|
||||
#include "cuda.h"
|
||||
#include "graphics.h"
|
||||
#include "sunshine/main.h"
|
||||
#include "sunshine/utility.h"
|
||||
#include "wayland.h"
|
||||
#include "x11grab.h"
|
||||
#include <NvFBC.h>
|
||||
#include <ffnvcodec/dynlink_loader.h>
|
||||
|
||||
extern "C" {
|
||||
@@ -12,6 +7,13 @@ extern "C" {
|
||||
#include <libavutil/imgutils.h>
|
||||
}
|
||||
|
||||
#include "cuda.h"
|
||||
#include "graphics.h"
|
||||
#include "sunshine/main.h"
|
||||
#include "sunshine/utility.h"
|
||||
#include "wayland.h"
|
||||
#include "x11grab.h"
|
||||
|
||||
#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
|
||||
#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
|
||||
|
||||
@@ -23,6 +25,13 @@ extern "C" {
|
||||
|
||||
using namespace std::literals;
|
||||
namespace cuda {
|
||||
constexpr auto cudaDevAttrMaxThreadsPerBlock = (CUdevice_attribute)1;
|
||||
constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39;
|
||||
|
||||
void pass_error(const std::string_view &sv, const char *name, const char *description) {
|
||||
BOOST_LOG(error) << sv << name << ':' << description;
|
||||
}
|
||||
|
||||
void cff(CudaFunctions *cf) {
|
||||
cuda_free_functions(&cf);
|
||||
}
|
||||
@@ -151,7 +160,7 @@ int init() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
class cuda_t : public platf::hwdevice_t {
|
||||
class opengl_t : public platf::hwdevice_t {
|
||||
public:
|
||||
int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
|
||||
if(!cdf) {
|
||||
@@ -273,16 +282,203 @@ public:
|
||||
int width, height;
|
||||
};
|
||||
|
||||
class cuda_t : public platf::hwdevice_t {
|
||||
public:
|
||||
~cuda_t() override {
|
||||
// sws_t needs to be destroyed while the context is active
|
||||
if(sws) {
|
||||
ctx_t ctx { cuda_ctx };
|
||||
|
||||
sws.reset();
|
||||
}
|
||||
}
|
||||
|
||||
int init(int in_width, int in_height) {
|
||||
if(!cdf) {
|
||||
BOOST_LOG(warning) << "cuda not initialized"sv;
|
||||
return -1;
|
||||
}
|
||||
|
||||
data = (void *)0x1;
|
||||
|
||||
width = in_width;
|
||||
height = in_height;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int set_frame(AVFrame *frame) override {
|
||||
this->hwframe.reset(frame);
|
||||
this->frame = frame;
|
||||
|
||||
if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
|
||||
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
|
||||
BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
|
||||
|
||||
ctx_t ctx { cuda_ctx };
|
||||
sws = sws_t::make(width * 4, height, frame->width, frame->height);
|
||||
|
||||
if(!sws) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int convert(platf::img_t &img) override {
|
||||
ctx_t ctx { cuda_ctx };
|
||||
|
||||
return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
|
||||
}
|
||||
|
||||
void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
|
||||
ctx_t ctx { cuda_ctx };
|
||||
sws->set_colorspace(colorspace, color_range);
|
||||
}
|
||||
|
||||
frame_t hwframe;
|
||||
|
||||
std::unique_ptr<sws_t> sws;
|
||||
|
||||
int width, height;
|
||||
|
||||
CUcontext cuda_ctx;
|
||||
};
|
||||
|
||||
std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
|
||||
if(init()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto cuda = std::make_shared<cuda_t>();
|
||||
if(cuda->init(width, height, xdisplay)) {
|
||||
if(cuda->init(width, height)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return cuda;
|
||||
}
|
||||
} // namespace cuda
|
||||
|
||||
namespace platf::nvfbc {
|
||||
static PNVFBCCREATEINSTANCE createInstance {};
|
||||
static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
|
||||
|
||||
static void *handle { nullptr };
|
||||
int init() {
|
||||
static bool funcs_loaded = false;
|
||||
|
||||
if(funcs_loaded) return 0;
|
||||
|
||||
if(!handle) {
|
||||
handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" });
|
||||
if(!handle) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::tuple<dyn::apiproc *, const char *>> funcs {
|
||||
{ (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" },
|
||||
};
|
||||
|
||||
if(dyn::load(handle, funcs)) {
|
||||
dlclose(handle);
|
||||
handle = nullptr;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
funcs_loaded = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
class handle_t {
|
||||
KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
|
||||
if(el == std::numeric_limits<std::uint64_t>::max()) {
|
||||
return;
|
||||
}
|
||||
NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
|
||||
|
||||
auto status = func.nvFBCDestroyHandle(el, ¶ms);
|
||||
if(status) {
|
||||
BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
|
||||
}
|
||||
});
|
||||
|
||||
public:
|
||||
static std::optional<handle_t> make() {
|
||||
NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
|
||||
session_t session;
|
||||
|
||||
auto status = func.nvFBCCreateHandle(&session.el, ¶ms);
|
||||
if(status) {
|
||||
BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
|
||||
session.release();
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
return handle_t { std::move(session) };
|
||||
}
|
||||
|
||||
const char *last_error() {
|
||||
return func.nvFBCGetLastErrorStr(session.el);
|
||||
}
|
||||
|
||||
std::optional<NVFBC_GET_STATUS_PARAMS> status() {
|
||||
NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
|
||||
|
||||
auto status = func.nvFBCGetStatus(session.el, ¶ms);
|
||||
if(status) {
|
||||
BOOST_LOG(error) << "Failed to create session: "sv << last_error();
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
return params;
|
||||
}
|
||||
|
||||
session_t session;
|
||||
};
|
||||
|
||||
std::vector<std::string> nvfbc_display_names() {
|
||||
if(init()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<std::string> display_names;
|
||||
|
||||
auto status = createInstance(&func);
|
||||
if(status) {
|
||||
BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
|
||||
return {};
|
||||
}
|
||||
|
||||
auto handle = handle_t::make();
|
||||
if(!handle) {
|
||||
return {};
|
||||
}
|
||||
|
||||
auto status_params = handle->status();
|
||||
if(!status_params) {
|
||||
return {};
|
||||
}
|
||||
|
||||
if(!status_params->bIsCapturePossible) {
|
||||
BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv;
|
||||
}
|
||||
|
||||
BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
|
||||
BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
|
||||
|
||||
return display_names;
|
||||
}
|
||||
} // namespace platf::nvfbc
|
||||
248
sunshine/platform/linux/cuda.cu
Normal file
248
sunshine/platform/linux/cuda.cu
Normal file
@@ -0,0 +1,248 @@
|
||||
// #include <algorithm>
|
||||
#include <helper_math.h>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string_view>
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
using namespace std::literals;
|
||||
|
||||
#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
|
||||
#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
|
||||
|
||||
#define CU_CHECK(x, y) \
|
||||
if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1
|
||||
|
||||
#define CU_CHECK_VOID(x, y) \
|
||||
if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return;
|
||||
|
||||
#define CU_CHECK_PTR(x, y) \
|
||||
if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
|
||||
|
||||
#define CU_CHECK_IGNORE(x, y) \
|
||||
check((x), SUNSHINE_STRINGVIEW(y ": "))
|
||||
|
||||
using namespace std::literals;
|
||||
|
||||
//////////////////// Special desclarations
|
||||
/**
|
||||
* NVCC segfaults when including <chrono>
|
||||
* Therefore, some declarations need to be added explicitely
|
||||
*/
|
||||
namespace platf {
|
||||
struct img_t {
|
||||
public:
|
||||
std::uint8_t *data {};
|
||||
std::int32_t width {};
|
||||
std::int32_t height {};
|
||||
std::int32_t pixel_pitch {};
|
||||
std::int32_t row_pitch {};
|
||||
|
||||
virtual ~img_t() = default;
|
||||
};
|
||||
} // namespace platf
|
||||
|
||||
namespace video {
|
||||
using __float4 = float[4];
|
||||
using __float3 = float[3];
|
||||
using __float2 = float[2];
|
||||
|
||||
struct __attribute__((__aligned__(16))) color_t {
|
||||
float4 color_vec_y;
|
||||
float4 color_vec_u;
|
||||
float4 color_vec_v;
|
||||
float2 range_y;
|
||||
float2 range_uv;
|
||||
};
|
||||
|
||||
struct __attribute__((__aligned__(16))) color_extern_t {
|
||||
__float4 color_vec_y;
|
||||
__float4 color_vec_u;
|
||||
__float4 color_vec_v;
|
||||
__float2 range_y;
|
||||
__float2 range_uv;
|
||||
};
|
||||
|
||||
extern color_extern_t colors[4];
|
||||
} // namespace video
|
||||
|
||||
//////////////////// End special declarations
|
||||
|
||||
namespace cuda {
|
||||
auto constexpr INVALID_TEXTURE = std::numeric_limits<cudaTextureObject_t>::max();
|
||||
|
||||
template<class T>
|
||||
inline T div_align(T l, T r) {
|
||||
return (l + r - 1) / r;
|
||||
}
|
||||
|
||||
void pass_error(const std::string_view &sv, const char *name, const char *description);
|
||||
inline static int check(cudaError_t result, const std::string_view &sv) {
|
||||
if(result) {
|
||||
auto name = cudaGetErrorName(result);
|
||||
auto description = cudaGetErrorString(result);
|
||||
|
||||
pass_error(sv, name, description);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ __constant__ video::color_t color;
|
||||
|
||||
|
||||
inline __device__ float3 bgra_to_rgb(uchar4 vec) {
|
||||
return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
|
||||
}
|
||||
|
||||
inline __device__ float2 calcUV(float3 pixel) {
|
||||
float4 vec_u = color.color_vec_u;
|
||||
float4 vec_v = color.color_vec_v;
|
||||
|
||||
float u = dot(pixel, make_float3(vec_u)) + vec_u.w;
|
||||
float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
|
||||
|
||||
u = u * color.range_uv.x + color.range_uv.y;
|
||||
v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
|
||||
|
||||
return make_float2(u, v);
|
||||
}
|
||||
|
||||
inline __device__ float calcY(float3 pixel) {
|
||||
float4 vec_y = color.color_vec_y;
|
||||
|
||||
return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y;
|
||||
}
|
||||
|
||||
__global__ void RGBA_to_NV12(
|
||||
cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
|
||||
std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
|
||||
std::uint32_t width, std::uint32_t height) {
|
||||
|
||||
int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
|
||||
int idY = (threadIdx.y + blockDim.y * blockIdx.y);
|
||||
|
||||
if(idX >= width) return;
|
||||
if(idY >= height) return;
|
||||
|
||||
dstY = dstY + idX + idY * dstPitchY;
|
||||
dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
|
||||
|
||||
float x = (float)idX / (float)width / 4;
|
||||
float y = (float)idY / (float)height;
|
||||
|
||||
float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
|
||||
float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height));
|
||||
|
||||
float2 uv = calcUV((rgb_l + rgb_r) * 0.5f);
|
||||
|
||||
dstUV[0] = uv.x;
|
||||
dstUV[1] = uv.y;
|
||||
dstY[0] = calcY(rgb_l);
|
||||
dstY[1] = calcY(rgb_r);
|
||||
}
|
||||
|
||||
sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock)
|
||||
: array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } {
|
||||
auto format = cudaCreateChannelDesc<uchar4>();
|
||||
|
||||
CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
|
||||
|
||||
cudaResourceDesc res {};
|
||||
res.resType = cudaResourceTypeArray;
|
||||
res.res.array.array = array;
|
||||
|
||||
cudaTextureDesc desc {};
|
||||
|
||||
desc.readMode = cudaReadModeElementType;
|
||||
desc.filterMode = cudaFilterModePoint;
|
||||
desc.normalizedCoords = true;
|
||||
|
||||
std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
|
||||
|
||||
CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
|
||||
}
|
||||
|
||||
sws_t::~sws_t() {
|
||||
if(texture != INVALID_TEXTURE) {
|
||||
CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
|
||||
|
||||
texture = INVALID_TEXTURE;
|
||||
}
|
||||
|
||||
if(array) {
|
||||
CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
|
||||
|
||||
array = cudaArray_t {};
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height) {
|
||||
cudaDeviceProp props;
|
||||
int device;
|
||||
CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
|
||||
CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
|
||||
|
||||
auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2);
|
||||
|
||||
if(sws->texture == INVALID_TEXTURE) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return sws;
|
||||
}
|
||||
|
||||
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
|
||||
int threadsX = width / 2;
|
||||
int threadsY = height;
|
||||
|
||||
dim3 block(threadsPerBlock, threadsPerBlock);
|
||||
dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
|
||||
|
||||
RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height);
|
||||
|
||||
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
|
||||
}
|
||||
|
||||
void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
|
||||
color_range = 1;
|
||||
colorspace = 5;
|
||||
video::color_extern_t *color_p;
|
||||
switch(colorspace) {
|
||||
case 5: // SWS_CS_SMPTE170M
|
||||
color_p = &video::colors[0];
|
||||
break;
|
||||
case 1: // SWS_CS_ITU709
|
||||
color_p = &video::colors[2];
|
||||
break;
|
||||
case 9: // SWS_CS_BT2020
|
||||
default:
|
||||
color_p = &video::colors[0];
|
||||
};
|
||||
|
||||
if(color_range > 1) {
|
||||
// Full range
|
||||
++color_p;
|
||||
}
|
||||
|
||||
auto color_matrix = *(video::color_t*)color_p;
|
||||
color_matrix.color_vec_y.w *= 256.0f;
|
||||
color_matrix.color_vec_u.w *= 256.0f;
|
||||
color_matrix.color_vec_v.w *= 256.0f;
|
||||
|
||||
color_matrix.range_y.y *= 256.0f;
|
||||
color_matrix.range_uv.y *= 256.0f;
|
||||
|
||||
static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
|
||||
|
||||
CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda");
|
||||
}
|
||||
|
||||
int sws_t::load_ram(platf::img_t &img) {
|
||||
return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
@@ -1,6 +1,8 @@
|
||||
#ifndef SUNSHINE_PLATFORM_CUDA_H
|
||||
#define SUNSHINE_PLATFORM_CUDA_H
|
||||
|
||||
#ifndef __NVCC__
|
||||
|
||||
#include "sunshine/platform/common.h"
|
||||
#include "x11grab.h"
|
||||
|
||||
@@ -9,4 +11,48 @@ std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x
|
||||
int init();
|
||||
} // namespace cuda
|
||||
|
||||
#else
|
||||
namespace platf {
|
||||
class img_t;
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct cudaArray *cudaArray_t;
|
||||
|
||||
#if !defined(__CUDACC__)
|
||||
typedef unsigned long long cudaTextureObject_t;
|
||||
#else /* defined(__CUDACC__) */
|
||||
typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
|
||||
#endif /* !defined(__CUDACC__) */
|
||||
|
||||
namespace cuda {
|
||||
class sws_t {
|
||||
public:
|
||||
~sws_t();
|
||||
sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock);
|
||||
|
||||
/**
|
||||
* in_width, out_width -- The width and height of the captured image in bytes
|
||||
* out_width, out_height -- the width and height of the NV12 image in pixels
|
||||
*
|
||||
* cuda_device -- pointer to the cuda device
|
||||
*/
|
||||
static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height);
|
||||
|
||||
// Converts loaded image into a CUDevicePtr
|
||||
int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
|
||||
|
||||
void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
|
||||
|
||||
int load_ram(platf::img_t &img);
|
||||
|
||||
cudaArray_t array;
|
||||
cudaTextureObject_t texture;
|
||||
|
||||
int width, height;
|
||||
|
||||
int threadsPerBlock;
|
||||
};
|
||||
} // namespace cuda
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user