Allow cuda kernel to run in parallell

This commit is contained in:
loki-47-6F-64
2021-09-27 17:58:35 +02:00
parent 847d7b6980
commit 4177b02064
4 changed files with 95 additions and 29 deletions

View File

@@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) {
CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
}
void freeCudaStream_t::operator()(cudaStream_t ptr) {
CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream");
}
stream_t make_stream(int flags) {
cudaStream_t stream;
if(!flags) {
CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream");
}
else {
CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags");
}
return stream_t { stream };
}
inline __device__ float3 bgra_to_rgb(uchar4 vec) {
return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
}
@@ -260,18 +277,18 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
}
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
return convert(Y, UV, pitchY, pitchUV, texture, viewport);
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) {
return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport);
}
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) {
int threadsX = viewport.width / 2;
int threadsY = viewport.height;
dim3 block(threadsPerBlock);
dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
}