Commit 486b249

Add CUDA synchronization points (#470)
Add cuStreamSynchronize calls before kernel launches and before reading back buffers/textures. This fixes issues with texture writes not being visible to subsequent launches. The CUDA driver API performs less automatic synchronization than the CUDA runtime API does. This needs more investigation, and we should eventually switch to a finer-grained synchronization scheme.
1 parent 5ae8ab8
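
A minimal sketch of the hazard described above, written directly against the CUDA driver API (this code is not part of the commit; the module handle, kernel names, and launch dimensions are hypothetical). One launch stores through a CUDA surface object, the next samples the same array through a CUDA texture object, and the cuStreamSynchronize in between is the "big hammer" the patch adds:

#include <cuda.h>

// Hypothetical kernels: "writeKernel" stores via surf2Dwrite, "readKernel"
// samples the same underlying CUarray via tex2D.
void launchWriteThenRead(CUmodule module, CUstream stream, CUsurfObject surf, CUtexObject tex)
{
    CUfunction writeKernel = nullptr;
    CUfunction readKernel = nullptr;
    cuModuleGetFunction(&writeKernel, module, "writeKernel");
    cuModuleGetFunction(&readKernel, module, "readKernel");

    void* writeArgs[] = {&surf};
    cuLaunchKernel(writeKernel, 8, 8, 1, 16, 16, 1, 0, stream, writeArgs, nullptr);

    // Without this, the next launch's texture fetches may see stale data;
    // per the commit message, the driver API performs less automatic
    // synchronization than the runtime API.
    cuStreamSynchronize(stream);

    void* readArgs[] = {&tex};
    cuLaunchKernel(readKernel, 8, 8, 1, 16, 16, 1, 0, stream, readArgs, nullptr);
}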

File tree

2 files changed: 17 additions & 5 deletions

src/cuda/cuda-command.cpp

Lines changed: 13 additions & 5 deletions
@@ -459,10 +459,9 @@ void CommandExecutor::cmdDispatchCompute(const commands::DispatchCompute& cmd)
             );
         }
     }
-    //
+
     // The argument data for the entry-point parameters are already
     // stored in host memory, as expected by cuLaunchKernel.
-    //
     SLANG_RHI_ASSERT(entryPointData.size >= computePipeline->m_paramBufferSize);
     void* extraOptions[] = {
         CU_LAUNCH_PARAM_BUFFER_POINTER,
@@ -472,9 +471,15 @@ void CommandExecutor::cmdDispatchCompute(const commands::DispatchCompute& cmd)
         CU_LAUNCH_PARAM_END,
     };
 
-    // Once we have all the necessary data extracted and/or
-    // set up, we can launch the kernel and see what happens.
-    //
+    // This is a big hammer! There are various scenarios where we have to synchronize the stream
+    // between kernel launches. One example is accessing texture memory: writes to texture memory (CUDA surface)
+    // are not guaranteed to be visible to the next launch reading from it (CUDA texture) without synchronization.
+    // We need to investigate if finer-grained synchronization makes sense here. But this requires
+    // tracking texture states similarly to how it is done in D3D12/Vulkan.
+    // Note: This *does not* block the host unless the context is set to blocking mode.
+    cuStreamSynchronize(m_stream);
+
+    // Once we have all the necessary data extracted and/or set up, we can launch the kernel.
     SLANG_CUDA_ASSERT_ON_FAIL(cuLaunchKernel(
         computePipeline->m_function,
         cmd.x,
@@ -541,6 +546,9 @@ void CommandExecutor::cmdDispatchRays(const commands::DispatchRays& cmd)
     OptixShaderBindingTable sbt = m_shaderTableInstance->sbt;
     sbt.raygenRecord += cmd.rayGenShaderIndex * m_shaderTableInstance->raygenRecordSize;
 
+    // This is a big hammer! See notes in `cmdDispatchCompute`.
+    cuStreamSynchronize(m_stream);
+
     SLANG_OPTIX_ASSERT_ON_FAIL(optixLaunch(
         m_rayTracingPipeline->m_pipeline,
         m_stream,
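
Two notes on the comment introduced above. First, per the CUDA driver API documentation, cuStreamSynchronize always blocks the calling host thread until the stream's work completes; the context scheduling flags (e.g. CU_CTX_SCHED_BLOCKING_SYNC versus the spinning defaults) change how that wait is implemented, not whether it happens. Second, one shape the finer-grained scheme alluded to could take is event-based ordering. A hedged sketch follows; it is not part of this commit, and whether an event wait is sufficient for the surface-to-texture visibility issue would need testing:

#include <cuda.h>

// Order a texture-reading launch on `consumer` after a surface-writing launch
// on `producer` without draining either stream on the host.
void orderProducerConsumer(CUstream producer, CUstream consumer)
{
    CUevent written = nullptr;
    cuEventCreate(&written, CU_EVENT_DISABLE_TIMING);

    // ... enqueue the surface-writing kernel on `producer` here ...

    cuEventRecord(written, producer);
    cuStreamWaitEvent(consumer, written, 0); // GPU-side wait; host not blocked

    // ... enqueue the texture-reading kernel on `consumer` here ...

    cuEventDestroy(written); // safe: destruction is deferred until the event completes
}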

src/cuda/cuda-device.cpp

Lines changed: 4 additions & 0 deletions
@@ -549,6 +549,8 @@ Result DeviceImpl::readTexture(
 {
     SLANG_CUDA_CTX_SCOPE(this);
 
+    cuStreamSynchronize(m_queue->m_stream);
+
     auto textureImpl = checked_cast<TextureImpl*>(texture);
 
     CUarray srcArray = textureImpl->m_cudaArray;
@@ -579,6 +581,8 @@ Result DeviceImpl::readBuffer(IBuffer* buffer, size_t offset, size_t size, void*
 {
     SLANG_CUDA_CTX_SCOPE(this);
 
+    cuStreamSynchronize(m_queue->m_stream);
+
     auto bufferImpl = checked_cast<BufferImpl*>(buffer);
     if (offset + size > bufferImpl->m_desc.size)
     {
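
Both readback fixes follow the same pattern: drain the queue's stream before copying device memory back to the host, so that any in-flight kernel writes have finished. A condensed sketch (identifier names are illustrative, standing in for m_queue->m_stream and the buffer's device pointer in the real code):

#include <cstddef>
#include <cuda.h>

// Synchronize, then copy: the explicit cuStreamSynchronize guarantees ordering
// regardless of how the queue's stream was created.
CUresult readBack(CUstream queueStream, void* hostDst, CUdeviceptr deviceSrc, size_t numBytes)
{
    CUresult result = cuStreamSynchronize(queueStream);
    if (result != CUDA_SUCCESS)
        return result;
    return cuMemcpyDtoH(hostDst, deviceSrc, numBytes);
}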
