5 changes: 5 additions & 0 deletions include/nbl/video/IGPUAccelerationStructure.h
@@ -638,6 +638,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStructure
// I don't do an actual union because the preceding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed
SRTMotionInstance<blas_ref_t> largestUnionMember = {};
static_assert(alignof(SRTMotionInstance<blas_ref_t>)==8ull);

public:
constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember);
};
using DevicePolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::device_op_ref_t>;
using HostPolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::host_op_ref_t>;
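The comment above describes a workaround worth spelling out: a real `union` would inherit the strictest alignment of its members and break the packing Vulkan expects, so the class stores the largest variant directly and exposes its size. A minimal self-contained sketch of the pattern, with hypothetical simplified instance types rather than the real Nabla declarations:

```cpp
#include <cstddef>
#include <cstdint>

// hypothetical simplified variants, NOT the real Nabla types
struct StaticInstance    { uint64_t blasRef; uint32_t customIndex; };
struct SRTMotionInstance { uint64_t blasRef; float srt[2][16]; }; // the largest variant

class PolymorphicInstanceSketch
{
		// a true union would take on the strictest member alignment, but the API
		// needs the struct packed, so store the largest variant and reinterpret per type
		SRTMotionInstance largestUnionMember = {};
		static_assert(alignof(SRTMotionInstance)==8ull);
	public:
		constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember);
};
static_assert(PolymorphicInstanceSketch::LargestUnionMemberSize==sizeof(SRTMotionInstance));
```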
@@ -664,6 +667,8 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStructure

//
using build_ver_t = uint32_t;
//
inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;}
// this gets called only when execution is guaranteed to happen, i.e. during submission rather than during command recording
inline build_ver_t registerNextBuildVer()
{
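The body of `registerNextBuildVer()` is collapsed in this diff, but the getter plus its comment imply a monotonic counter bumped only once execution is certain. A plausible sketch under that assumption (the real implementation may differ, e.g. in how atomicity is handled):

```cpp
#include <atomic>
#include <cstdint>

using build_ver_t = uint32_t;

class BuildVersionSketch
{
		std::atomic<build_ver_t> m_pendingBuildVer = 0;
	public:
		// safe to poll anytime, e.g. to check whether a recorded build is still the latest
		build_ver_t getPendingBuildVer() const { return m_pendingBuildVer.load(); }
		// only call once execution is guaranteed (at submission, not while recording)
		build_ver_t registerNextBuildVer() { return m_pendingBuildVer.fetch_add(1)+1; }
};
```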
119 changes: 61 additions & 58 deletions include/nbl/video/utilities/CAssetConverter.h
@@ -900,6 +900,9 @@ class CAssetConverter : public core::IReferenceCounted
IGPUPipelineCache* pipelineCache = nullptr;
// optional, defaults to the device
IDeviceMemoryAllocator* allocator = nullptr;
// optional, defaults to the worst case (the 16 KiB Apple Silicon page size)
uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14;
uint32_t scratchForHostASBuildMinAllocSize = 1<<14;
};
// Split off from inputs because only assets that build on IPreHashed need uploading
struct SConvertParams
@@ -970,7 +973,14 @@ class CAssetConverter : public core::IReferenceCounted

public:
template<asset::Asset AssetType>
using staging_cache_t = core::unordered_map<typename asset_traits<AssetType>::video_t*,typename CCache<AssetType>::key_t>;
struct staging_cache_key
{
core::smart_refctd_ptr<typename asset_traits<AssetType>::video_t> gpuRef;
typename CCache<AssetType>::key_t cacheKey;
};
// it may seem weird to store both a smart pointer and a raw pointer, but this lets us drop a refcount without losing the key for lookup
template<asset::Asset AssetType>
using staging_cache_t = core::unordered_map<const typename asset_traits<AssetType>::video_t*,staging_cache_key<AssetType>>;

inline SReserveResult(SReserveResult&&) = default;
inline SReserveResult(const SReserveResult&) = delete;
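The raw-pointer-key/smart-pointer-value layout that the comment defends can be shown in isolation. A minimal sketch with placeholder types (`GpuObject` and `CacheKey` are stand-ins, not converter types):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct GpuObject {};
using CacheKey = std::string;

struct StagingValue
{
	std::shared_ptr<GpuObject> gpuRef; // keeps the GPU object alive while staged
	CacheKey cacheKey;                 // what gets written to the read cache on success
};
// keyed by raw pointer so the refcount can be dropped without losing the lookup key
using StagingCache = std::unordered_map<const GpuObject*,StagingValue>;

void dropFailedConversion(StagingCache& cache, const GpuObject* obj)
{
	if (auto found=cache.find(obj); found!=cache.end())
	{
		found->second.gpuRef.reset(); // release our reference first...
		cache.erase(found);           // ...the raw-pointer key is still usable for erasure
	}
}
```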
@@ -1000,7 +1010,12 @@ class CAssetConverter : public core::IReferenceCounted
assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]);
return m_maxASBuildScratchSize[forHostOps];
}
// TODO: `getMinCompactedASAllocatorSpace`
// We do all compactions on the Device for simplicity
inline uint64_t getMinCompactedASAllocatorSpace() const
{
assert(m_compactedASMaxMemory == 0 || willDeviceASBuild() || willHostASBuild());
return m_compactedASMaxMemory;
}
// tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild`
inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;}
// tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild`
@@ -1013,8 +1028,7 @@
// tells you if you need to provide a valid `SConvertParams::compactedASAllocator`
inline bool willCompactAS() const
{
assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild());
return m_willCompactSomeAS;
return getMinCompactedASAllocatorSpace()!=0;
}
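Taken together, these queries let a caller size allocations before committing to a conversion. A self-contained usage sketch; `FakeReservation` only models the query surface above and is not the real `SReserveResult`:

```cpp
#include <cstdint>
#include <cstdio>

struct FakeReservation // models SReserveResult's query surface
{
	uint64_t minScratch[2] = {1u<<20,0}; // [0]=device, [1]=host: device build needs 1 MiB
	uint64_t compactedASMaxMemory = 1u<<16;
	uint64_t getMinASBuildScratchSize(const bool forHostOps) const { return minScratch[forHostOps]; }
	uint64_t getMinCompactedASAllocatorSpace() const { return compactedASMaxMemory; }
	bool willDeviceASBuild() const { return getMinASBuildScratchSize(false)>0; }
	bool willHostASBuild() const { return getMinASBuildScratchSize(true)>0; }
	bool willCompactAS() const { return getMinCompactedASAllocatorSpace()!=0; }
};

int main()
{
	const FakeReservation reservation;
	if (reservation.willDeviceASBuild()) // need a valid scratchForDeviceASBuild
		std::printf("device scratch needed: >= %llu bytes\n",
			(unsigned long long)reservation.getMinASBuildScratchSize(false));
	if (reservation.willCompactAS())     // need a valid compactedASAllocator
		std::printf("compacted AS memory: %llu bytes\n",
			(unsigned long long)reservation.getMinCompactedASAllocatorSpace());
	return 0;
}
```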

//
@@ -1057,21 +1071,10 @@
return enqueueSuccess;
}

// public only because `GetDependantVisit<ICPUDescriptorSet>` needs it
struct SDeferredTLASWrite
{
inline bool operator==(const SDeferredTLASWrite& other) const
{
return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement;
}

IGPUDescriptorSet* dstSet;
uint32_t binding;
uint32_t arrayElement;
core::smart_refctd_ptr<IGPUTopLevelAccelerationStructure> tlas;
};
private:
friend class CAssetConverter;
// internal classes
template<asset::Asset AssetType> friend class GetDependantVisit;

inline SReserveResult() = default;

@@ -1087,70 +1090,70 @@

// we don't insert into the writeCache until conversions are successful
core::tuple_transform_t<staging_cache_t,supported_asset_types> m_stagingCaches;

// need a more explicit list of GPU objects that need device-assisted conversion
template<asset::Asset AssetType>
struct SConversionRequestBase
{
// canonical asset (the one that provides content)
core::smart_refctd_ptr<const AssetType> canonical;
// gpu object to transfer canonical's data to or build it from
asset_traits<AssetType>::video_t* gpuObj;
};
using SConvReqBuffer = SConversionRequestBase<asset::ICPUBuffer>;
core::vector<SConvReqBuffer> m_bufferConversions;
struct SConvReqImage : SConversionRequestBase<asset::ICPUImage>
core::unordered_map<IGPUBuffer*,core::smart_refctd_ptr<const asset::ICPUBuffer>> m_bufferConversions;
struct SConvReqImage
{
core::smart_refctd_ptr<const asset::ICPUImage> canonical = nullptr;
uint16_t recomputeMips = 0;
};
core::vector<SConvReqImage> m_imageConversions;
core::unordered_map<IGPUImage*,SConvReqImage> m_imageConversions;
template<typename CPUAccelerationStructure>
struct SConvReqAccelerationStructure : SConversionRequestBase<CPUAccelerationStructure>
struct SConvReqAccelerationStructure
{
constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1;
inline bool compact() const {return compactedASWriteOffset!=WontCompact;}

using build_f = typename asset_traits<CPUAccelerationStructure>::video_t::BUILD_FLAGS;
inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast<uint16_t>(_flags);}
inline build_f getBuildFlags() const {return static_cast<build_f>(buildFlags);}


uint64_t scratchSize;
uint64_t compactedASWriteOffset : 48 = WontCompact;
uint64_t buildFlags : 16 = static_cast<uint16_t>(build_f::NONE);
core::smart_refctd_ptr<const CPUAccelerationStructure> canonical = nullptr;
uint64_t scratchSize : 47 = 0;
uint64_t buildFlags : 16 = 0;
uint64_t compact : 1;
// scratch + input size also accounting for worst case padding due to alignment
uint64_t buildSize;
};
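The new request struct packs its metadata into a single 64-bit word (47+16+1 bits). A standalone sketch of that packing; the `build_f` enum here is a placeholder for the real GPU-AS `BUILD_FLAGS` type:

```cpp
#include <cassert>
#include <cstdint>

enum class build_f : uint16_t { NONE=0, ALLOW_COMPACTION=0x2 }; // placeholder flags

struct PackedASRequestSketch
{
	uint64_t scratchSize : 47 = 0; // up to 128 TiB-1 of scratch, more than enough
	uint64_t buildFlags : 16 = 0;
	uint64_t compact : 1 = 0;      // 47+16+1 = 64 bits, a single word on common ABIs

	void setBuildFlags(const build_f f) { buildFlags = static_cast<uint16_t>(f); }
	build_f getBuildFlags() const { return static_cast<build_f>(buildFlags); }
};
static_assert(sizeof(PackedASRequestSketch)==8);

int main()
{
	PackedASRequestSketch req;
	req.scratchSize = 1ull<<30; // 1 GiB fits comfortably in 47 bits
	req.setBuildFlags(build_f::ALLOW_COMPACTION);
	req.compact = 1;
	assert(req.getBuildFlags()==build_f::ALLOW_COMPACTION);
	return 0;
}
```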
using SConvReqBLASMap = core::unordered_map<IGPUBottomLevelAccelerationStructure*,SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>>;
SConvReqBLASMap m_blasConversions[2];
struct SConvReqTLAS : SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>
{
// This tracks non-root BLASes which are needed for a subsequent TLAS build.
// Because the copy group ID of a BLAS can only depend on the copy group and pointer of the TLAS and BLAS,
// all instances of the same BLAS within a TLAS are guaranteed the same copy group ID, so we can use a map instead of a vector for storage.
// Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely.
using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>>;
cpu_to_gpu_blas_map_t instanceMap;
};
using SConvReqBLAS = SConvReqAccelerationStructure<asset::ICPUBottomLevelAccelerationStructure>;
core::vector<SConvReqBLAS> m_blasConversions[2];
using SConvReqTLAS = SConvReqAccelerationStructure<asset::ICPUTopLevelAccelerationStructure>;
core::vector<SConvReqTLAS> m_tlasConversions[2];
using SConvReqTLASMap = core::unordered_map<IGPUTopLevelAccelerationStructure*,SConvReqTLAS>;
SConvReqTLASMap m_tlasConversions[2];
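The `instanceMap` comment above is really about lifetime pinning: holding one smart pointer per canonical BLAS guarantees nothing a pending TLAS build references dies early, and keying by pointer collapses duplicate instances into one entry. A sketch with placeholder types:

```cpp
#include <cassert>
#include <memory>
#include <unordered_map>

struct CpuBLAS {}; // placeholder for asset::ICPUBottomLevelAccelerationStructure
struct GpuBLAS {}; // placeholder for IGPUBottomLevelAccelerationStructure

using InstanceMap = std::unordered_map<const CpuBLAS*,std::shared_ptr<const GpuBLAS>>;

int main()
{
	CpuBLAS cpuBlas;
	auto gpuBlas = std::make_shared<const GpuBLAS>();

	InstanceMap instanceMap;
	// every instance of the same BLAS maps to the same entry,
	// so a map (not a vector) is enough to track all of them
	instanceMap[&cpuBlas] = gpuBlas;
	instanceMap[&cpuBlas] = gpuBlas; // idempotent re-insert

	gpuBlas.reset();               // the caller drops its reference...
	assert(instanceMap.size()==1); // ...but the map still pins the GPU BLAS alive
	return 0;
}
```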

// 0 for device builds, 1 for host builds
// array index 0 for device builds, 1 for host builds
uint64_t m_minASBuildScratchSize[2] = {0,0};
uint64_t m_maxASBuildScratchSize[2] = {0,0};
// TODO: make the compaction count the size
// We do all compactions on the Device for simplicity
uint8_t m_willCompactSomeAS : 1 = false;
// This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early.
struct BLASUsedInTLASBuild
uint64_t m_compactedASMaxMemory = 0;
//
struct SDeferredTLASWrite
{
// This is the BLAS meant to be used for the instance; note that compaction of a BLAS overwrites the initial values at the end of `reserve`
core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure> gpuBLAS;
uint64_t buildDuringConvertCall : 1 = false;
// internal micro-refcount which lets us know when we should remove the entry from the map below
uint64_t remainingUsages : 63 = 0;
inline bool operator==(const SDeferredTLASWrite& other) const
{
return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data;
}

IGPUDescriptorSet* dstSet;
// binding and array element rolled up into one
IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset;
};
using cpu_to_gpu_blas_map_t = core::unordered_map<const asset::ICPUBottomLevelAccelerationStructure*,BLASUsedInTLASBuild>;
cpu_to_gpu_blas_map_t m_blasBuildMap;
struct SDeferredTLASWriteHasher
{
inline size_t operator()(const SDeferredTLASWrite& write) const
{
size_t retval = std::bit_cast<size_t>(write.dstSet);
core::hash_combine(retval,write.binding);
core::hash_combine(retval,write.arrayElement);
size_t retval = write.storageOffset.data;
core::hash_combine(retval,write.dstSet);
return retval;
}
};
core::unordered_set<SDeferredTLASWrite,SDeferredTLASWriteHasher> m_deferredTLASDescriptorWrites;
using compacted_tlas_rewrite_set_t = core::unordered_set<SDeferredTLASWrite,SDeferredTLASWriteHasher>;
compacted_tlas_rewrite_set_t m_potentialTLASRewrites;
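The rewrite set relies on a custom hasher mixing the storage offset with the descriptor-set pointer. A self-contained sketch; `hash_combine` below is a boost-style stand-in for `core::hash_combine`, the types are placeholders, and the pointer `bit_cast` assumes a 64-bit platform:

```cpp
#include <bit>
#include <cstddef>
#include <cstdint>
#include <unordered_set>

struct FakeDescriptorSet {};

// boost-style mixer standing in for core::hash_combine
inline void hash_combine(std::size_t& seed, const std::size_t v)
{
	seed ^= v + 0x9e3779b97f4a7c15ull + (seed<<6) + (seed>>2);
}

struct DeferredWrite
{
	FakeDescriptorSet* dstSet;
	uint64_t storageOffset; // binding and array element rolled up into one
	bool operator==(const DeferredWrite&) const = default;
};
struct DeferredWriteHasher
{
	std::size_t operator()(const DeferredWrite& w) const
	{
		std::size_t retval = w.storageOffset;
		hash_combine(retval,std::bit_cast<std::size_t>(w.dstSet));
		return retval;
	}
};

int main()
{
	FakeDescriptorSet set;
	std::unordered_set<DeferredWrite,DeferredWriteHasher> writes;
	writes.insert({&set,42});
	writes.insert({&set,42}); // same slot, deduplicated
	return writes.size()==1 ? 0:1;
}
```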

//
core::bitflag<IQueue::FAMILY_FLAGS> m_queueFlags = IQueue::FAMILY_FLAGS::NONE;
163 changes: 0 additions & 163 deletions include/nbl/video/utilities/IGPUObjectFromAssetConverter.h
@@ -11,128 +11,6 @@
#include "nbl/video/ILogicalDevice.h"

#if 0
auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array<asset::ICPUAccelerationStructure>
{
const size_t assetCount = std::distance(_begin, _end);
auto res = core::make_refctd_dynamic_array<created_gpu_object_array<asset::ICPUAccelerationStructure> >(assetCount);
auto toCreateAndBuild = std::vector<const asset::ICPUAccelerationStructure*>();
auto buildRangeInfos = std::vector<IGPUAccelerationStructure::BuildRangeInfo*>();
toCreateAndBuild.reserve(assetCount);
buildRangeInfos.reserve(assetCount);
// Lambda that creates the acceleration structure and its buffer
auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* cpuas)
{
// Create buffer with cpuas->getAccelerationStructureSize
IGPUBuffer::SCreationParams gpuBufParams = {};
gpuBufParams.size = asSize;
gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams));
auto mreqs = gpubuf->getMemoryReqs();
mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
assert(gpubufMem.isValid());

// Create GPUAccelerationStructure with that buffer
IGPUAccelerationStructure::SCreationParams creatationParams = {};
creatationParams.bufferRange.buffer = gpubuf;
creatationParams.bufferRange.offset = 0;
creatationParams.bufferRange.size = asSize;
creatationParams.flags = cpuas->getCreationParameters().flags;
creatationParams.type = cpuas->getCreationParameters().type;
return _params.device->createAccelerationStructure(std::move(creatationParams));
};

for (ptrdiff_t i = 0u; i < assetCount; ++i)
{
const asset::ICPUAccelerationStructure* cpuas = _begin[i];

if(cpuas->hasBuildInfo())
{
// Add to toBuild vector of ICPUAccelerationStructure
toCreateAndBuild.push_back(cpuas);
buildRangeInfos.push_back(const_cast<IGPUAccelerationStructure::BuildRangeInfo*>(cpuas->getBuildRanges().begin()));
}
else if(cpuas->getAccelerationStructureSize() > 0)
{
res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas);
}
}

if(toCreateAndBuild.empty() == false)
{
bool hostBuildCommands = false; // get from SFeatures
if(hostBuildCommands)
{
_NBL_TODO();
}
else
{
core::vector<const asset::ICPUBuffer*> cpuBufferDeps;
constexpr uint32_t MaxGeometryPerBuildInfo = 16;
constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation
cpuBufferDeps.reserve(assetCount * MaxGeometryPerBuildInfo * MaxBuffersPerGeometry);

// Get CPUBuffer Dependencies
for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i)
{
const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i];

auto buildInfo = cpuas->getBuildInfo();
assert(buildInfo != nullptr);

auto geoms = buildInfo->getGeometries().begin();
auto geomsCount = buildInfo->getGeometries().size();
if(geomsCount == 0)
{
assert(false);
continue;
}

for(uint32_t g = 0; g < geomsCount; ++g)
{
const auto& geom = geoms[g];
if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES)
{
if(geom.data.triangles.indexData.isValid())
{
auto cpuBuf = geom.data.triangles.indexData.buffer.get();
cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
cpuBufferDeps.push_back(cpuBuf);
}
if(geom.data.triangles.vertexData.isValid())
{
auto cpuBuf = geom.data.triangles.vertexData.buffer.get();
cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
cpuBufferDeps.push_back(cpuBuf);
}
if(geom.data.triangles.transformData.isValid())
{
auto cpuBuf = geom.data.triangles.transformData.buffer.get();
cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
cpuBufferDeps.push_back(cpuBuf);
}
}
else if(geom.type == asset::IAccelerationStructure::EGT_AABBS)
{
if(geom.data.aabbs.data.isValid())
{
auto cpuBuf = geom.data.aabbs.data.buffer.get();
cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
cpuBufferDeps.push_back(cpuBuf);
}
}
else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES)
{
if(geom.data.instances.data.isValid())
{
auto cpuBuf = geom.data.instances.data.buffer.get();
cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT);
cpuBufferDeps.push_back(cpuBuf);
}
}
}
}

// Convert CPUBuffer Deps to GPUBuffers
core::vector<size_t> redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps);
auto gpuBufs = getGPUObjectsFromAssets<asset::ICPUBuffer>(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params);
@@ -285,47 +163,6 @@ auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure
auto & gpuBuildInfo = buildGeomInfos[i];
gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf;
}

// Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS)
auto & fence = _params.fences[EQU_COMPUTE];
fence = _params.device->createFence(static_cast<IGPUFence::E_CREATE_FLAGS>(0));
core::smart_refctd_ptr<IGPUCommandBuffer> cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf;

IQueue::SSubmitInfo submit;
{
submit.commandBufferCount = 1u;
submit.commandBuffers = &cmdbuf.get();
submit.waitSemaphoreCount = 0u;
submit.pWaitDstStageMask = nullptr;
submit.pWaitSemaphores = nullptr;
uint32_t waitSemaphoreCount = 0u;
}

assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING);
cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data());
cmdbuf->end();

// TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build

core::smart_refctd_ptr<IGPUSemaphore> sem;

if (_params.perQueue[EQU_COMPUTE].semaphore)
sem = _params.device->createSemaphore();

auto* sem_ptr = sem.get();
auto* fence_ptr = fence.get();

submit.signalSemaphoreCount = sem_ptr?1u:0u;
submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr;

_params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr);
if (_params.perQueue[EQU_COMPUTE].semaphore)
_params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem);
}
}

return res;
}
#endif

#endif