Commit 486b249

Add CUDA synchronization points (#470)
Add cuStreamSynchronize calls before kernel launches and before reading back buffers/textures. This fixes issues with texture writes not being visible to subsequent launches. The CUDA driver API performs less automatic synchronization than the CUDA runtime API does. This needs more investigation, and we should eventually switch to a finer-grained synchronization scheme.
1 parent 5ae8ab8
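
A minimal sketch of the hazard described above, written directly against the CUDA driver API (this code is not part of the commit; the module handle, kernel names, and launch dimensions are hypothetical). One launch stores through a CUDA surface object, the next samples the same array through a CUDA texture object, and the cuStreamSynchronize in between is the "big hammer" the patch adds:

#include <cuda.h>

// Hypothetical kernels: "writeKernel" stores via surf2Dwrite, "readKernel"
// samples the same underlying CUarray via tex2D.
void launchWriteThenRead(CUmodule module, CUstream stream, CUsurfObject surf, CUtexObject tex)
{
    CUfunction writeKernel = nullptr;
    CUfunction readKernel = nullptr;
    cuModuleGetFunction(&writeKernel, module, "writeKernel");
    cuModuleGetFunction(&readKernel, module, "readKernel");

    void* writeArgs[] = {&surf};
    cuLaunchKernel(writeKernel, 8, 8, 1, 16, 16, 1, 0, stream, writeArgs, nullptr);

    // Without this, the next launch's texture fetches may see stale data;
    // per the commit message, the driver API performs less automatic
    // synchronization than the runtime API.
    cuStreamSynchronize(stream);

    void* readArgs[] = {&tex};
    cuLaunchKernel(readKernel, 8, 8, 1, 16, 16, 1, 0, stream, readArgs, nullptr);
}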

File tree

2 files changed: 17 additions & 5 deletions

src/cuda/cuda-command.cpp

Lines changed: 13 additions & 5 deletions
@@ -459,10 +459,9 @@ void CommandExecutor::cmdDispatchCompute(const commands::DispatchCompute& cmd)
             );
         }
     }
-    //
+
     // The argument data for the entry-point parameters are already
     // stored in host memory, as expected by cuLaunchKernel.
-    //
     SLANG_RHI_ASSERT(entryPointData.size >= computePipeline->m_paramBufferSize);
     void* extraOptions[] = {
         CU_LAUNCH_PARAM_BUFFER_POINTER,
@@ -472,9 +471,15 @@ void CommandExecutor::cmdDispatchCompute(const commands::DispatchCompute& cmd)
         CU_LAUNCH_PARAM_END,
     };
 
-    // Once we have all the necessary data extracted and/or
-    // set up, we can launch the kernel and see what happens.
-    //
+    // This is a big hammer! There are various scenarios where we have to synchronize the stream
+    // between kernel launches. One example is accessing texture memory: writes to texture memory (CUDA surface)
+    // are not guaranteed to be visible to the next launch reading from it (CUDA texture) without synchronization.
+    // We need to investigate if finer-grained synchronization makes sense here. But this requires
+    // tracking texture states similarly to how it is done in D3D12/Vulkan.
+    // Note: This *does not* block the host unless the context is set to blocking mode.
+    cuStreamSynchronize(m_stream);
+
+    // Once we have all the necessary data extracted and/or set up, we can launch the kernel.
     SLANG_CUDA_ASSERT_ON_FAIL(cuLaunchKernel(
         computePipeline->m_function,
         cmd.x,
@@ -541,6 +546,9 @@ void CommandExecutor::cmdDispatchRays(const commands::DispatchRays& cmd)
     OptixShaderBindingTable sbt = m_shaderTableInstance->sbt;
     sbt.raygenRecord += cmd.rayGenShaderIndex * m_shaderTableInstance->raygenRecordSize;
 
+    // This is a big hammer! See notes in `cmdDispatchCompute`.
+    cuStreamSynchronize(m_stream);
+
     SLANG_OPTIX_ASSERT_ON_FAIL(optixLaunch(
         m_rayTracingPipeline->m_pipeline,
         m_stream,
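
Two notes on the comment introduced above. First, per the CUDA driver API documentation, cuStreamSynchronize always blocks the calling host thread until the stream's work completes; the context scheduling flags (e.g. CU_CTX_SCHED_BLOCKING_SYNC versus the spinning defaults) change how that wait is implemented, not whether it happens. Second, one shape the finer-grained scheme alluded to could take is event-based ordering. A hedged sketch follows; it is not part of this commit, and whether an event wait is sufficient for the surface-to-texture visibility issue would need testing:

#include <cuda.h>

// Order a texture-reading launch on `consumer` after a surface-writing launch
// on `producer` without draining either stream on the host.
void orderProducerConsumer(CUstream producer, CUstream consumer)
{
    CUevent written = nullptr;
    cuEventCreate(&written, CU_EVENT_DISABLE_TIMING);

    // ... enqueue the surface-writing kernel on `producer` here ...

    cuEventRecord(written, producer);
    cuStreamWaitEvent(consumer, written, 0); // GPU-side wait; host not blocked

    // ... enqueue the texture-reading kernel on `consumer` here ...

    cuEventDestroy(written); // safe: destruction is deferred until the event completes
}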

src/cuda/cuda-device.cpp

Lines changed: 4 additions & 0 deletions
@@ -549,6 +549,8 @@ Result DeviceImpl::readTexture(
 {
     SLANG_CUDA_CTX_SCOPE(this);
 
+    cuStreamSynchronize(m_queue->m_stream);
+
     auto textureImpl = checked_cast<TextureImpl*>(texture);
 
     CUarray srcArray = textureImpl->m_cudaArray;
@@ -579,6 +581,8 @@ Result DeviceImpl::readBuffer(IBuffer* buffer, size_t offset, size_t size, void*
 {
     SLANG_CUDA_CTX_SCOPE(this);
 
+    cuStreamSynchronize(m_queue->m_stream);
+
     auto bufferImpl = checked_cast<BufferImpl*>(buffer);
     if (offset + size > bufferImpl->m_desc.size)
     {
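
Both readback fixes follow the same pattern: drain the queue's stream before copying device memory back to the host, so that any in-flight kernel writes have finished. A condensed sketch (identifier names are illustrative, standing in for m_queue->m_stream and the buffer's device pointer in the real code):

#include <cstddef>
#include <cuda.h>

// Synchronize, then copy: the explicit cuStreamSynchronize guarantees ordering
// regardless of how the queue's stream was created.
CUresult readBack(CUstream queueStream, void* hostDst, CUdeviceptr deviceSrc, size_t numBytes)
{
    CUresult result = cuStreamSynchronize(queueStream);
    if (result != CUDA_SUCCESS)
        return result;
    return cuMemcpyDtoH(hostDst, deviceSrc, numBytes);
}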
