Add standalone NVENC encoder

This commit is contained in:
ns6089
2023-04-25 16:38:37 +03:00
committed by Cameron Gutman
parent 7fe52bc5f8
commit 68fa43a61c
34 changed files with 2124 additions and 642 deletions

View File

@@ -56,12 +56,11 @@ public:
};
} // namespace platf
namespace video {
using __float4 = float[4];
using __float3 = float[3];
using __float2 = float[2];
// End special declarations
struct alignas(16) color_t {
namespace cuda {
struct alignas(16) cuda_color_t {
float4 color_vec_y;
float4 color_vec_u;
float4 color_vec_v;
@@ -69,22 +68,8 @@ struct alignas(16) color_t {
float2 range_uv;
};
struct alignas(16) color_extern_t {
__float4 color_vec_y;
__float4 color_vec_u;
__float4 color_vec_v;
__float2 range_y;
__float2 range_uv;
};
static_assert(sizeof(video::color_t) == sizeof(cuda::cuda_color_t), "color matrix struct mismatch");
static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
extern color_t colors[6];
} // namespace video
// End special declarations
namespace cuda {
auto constexpr INVALID_TEXTURE = std::numeric_limits<cudaTextureObject_t>::max();
template<class T>
@@ -144,7 +129,7 @@ inline __device__ float3 bgra_to_rgb(float4 vec) {
return make_float3(vec.z, vec.y, vec.x);
}
inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) {
inline __device__ float2 calcUV(float3 pixel, const cuda_color_t *const color_matrix) {
float4 vec_u = color_matrix->color_vec_u;
float4 vec_v = color_matrix->color_vec_v;
@@ -157,7 +142,7 @@ inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_
return make_float2(u, v);
}
inline __device__ float calcY(float3 pixel, const video::color_t *const color_matrix) {
inline __device__ float calcY(float3 pixel, const cuda_color_t *const color_matrix) {
float4 vec_y = color_matrix->color_vec_y;
return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color_matrix->range_y.x + color_matrix->range_y.y;
@@ -166,7 +151,7 @@ inline __device__ float calcY(float3 pixel, const video::color_t *const color_ma
__global__ void RGBA_to_NV12(
cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
float scale, const viewport_t viewport, const video::color_t *const color_matrix) {
float scale, const viewport_t viewport, const cuda_color_t *const color_matrix) {
int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
int idY = (threadIdx.y + blockDim.y * blockIdx.y) * 2;
@@ -297,7 +282,7 @@ std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int
CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device");
CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
auto ptr = make_ptr<video::color_t>();
auto ptr = make_ptr<cuda_color_t>();
if(!ptr) {
return std::nullopt;
}
@@ -316,32 +301,13 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std:
dim3 block(threadsPerBlock);
dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (cuda_color_t *)color_matrix.get());
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
}
void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
video::color_t *color_p;
switch(colorspace) {
case 5: // SWS_CS_SMPTE170M
color_p = &video::colors[0];
break;
case 1: // SWS_CS_ITU709
color_p = &video::colors[2];
break;
case 9: // SWS_CS_BT2020
color_p = &video::colors[4];
break;
default:
color_p = &video::colors[0];
};
if(color_range > 1) {
// Full range
++color_p;
}
void sws_t::apply_colorspace(const video::sunshine_colorspace_t& colorspace) {
auto color_p = video::color_vectors_from_colorspace(colorspace);
CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
}