EDIT: BLAS build is now per-frame async

2025-12-08 15:43:06 +09:00
parent f95520dcb1
commit 33fc35ab6c
9 changed files with 416 additions and 68 deletions
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -1185,6 +1185,10 @@ void VulkanEngine::run()
        // Safe to destroy any BLAS queued for deletion now that the previous frame is idle.
        if (_rayManager) { _rayManager->flushPendingDeletes(); }

+        // Progress queued BLAS builds over multiple frames to avoid large
+        // stalls when many meshes require ray tracing structures at once.
+        if (_rayManager) { _rayManager->pump_blas_builds(1); }
+
        // Commit any completed async IBL load now that the GPU is idle.
        if (_iblManager && _pendingIBLRequest.active)
        {
--- a/src/core/raytracing/raytracing.cpp
+++ b/src/core/raytracing/raytracing.cpp
@@ -35,6 +35,8 @@ void RayTracingManager::cleanup()
    VkDevice dv = _device->device();
    // Destroy any deferred BLAS first
    flushPendingDeletes();
+    _blasBuildQueue.clear();
+    _blasPendingMeshes.clear();

    if (_tlas.handle)
    {
@@ -100,21 +102,53 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
 {
    if (!mesh) return {};

-    // If uploads are deferred, ensure any pending mesh buffer uploads are flushed
-    // before building a BLAS that reads from those GPU buffers.
-    if (_resources && _resources->deferred_uploads() && _resources->has_pending_uploads())
-    {
-        fmt::println("[RT] getOrBuildBLAS: flushing pending resource uploads before BLAS build");
-        _resources->process_queued_uploads_immediate();
-    }
+    const MeshAsset* key = mesh.get();

-    if (auto it = _blasByMesh.find(mesh.get()); it != _blasByMesh.end())
+    // If a BLAS is already cached (even an empty sentinel), return it directly.
+    if (auto it = _blasByMesh.find(key); it != _blasByMesh.end())
    {
        fmt::println("[RT] getOrBuildBLAS reuse by mesh mesh='{}' handle={}", mesh->name,
                     static_cast<const void *>(it->second.handle));
        return it->second;
    }

+    // If a build is already queued or in progress for this mesh, do not enqueue
+    // another job; simply report "not ready yet".
+    if (_blasPendingMeshes.find(key) != _blasPendingMeshes.end())
+    {
+        fmt::println("[RT] getOrBuildBLAS pending build mesh='{}'", mesh->name);
+        return {};
+    }
+
+    // If uploads are deferred, ensure any pending mesh buffer uploads are flushed
+    // before queuing a BLAS that will read from those GPU buffers.
+    if (_resources && _resources->deferred_uploads() && _resources->has_pending_uploads())
+    {
+        fmt::println("[RT] getOrBuildBLAS: flushing pending resource uploads before queuing BLAS build");
+        _resources->process_queued_uploads_immediate();
+    }
+
+    fmt::println("[RT] getOrBuildBLAS queue build mesh='{}'", mesh->name);
+    _blasPendingMeshes.insert(key);
+    _blasBuildQueue.push_back(PendingBlasBuild{key});
+
+    // BLAS will be built asynchronously by pump_blas_builds(); until then,
+    // callers should treat the empty handle as "not ready yet".
+    return {};
+}
+
+AccelStructureHandle RayTracingManager::build_blas_for_mesh(const MeshAsset *mesh)
+{
+    if (!mesh || !_resources || !_device) return {};
+
+    // If uploads are deferred, ensure any pending mesh buffer uploads are flushed
+    // before building a BLAS that reads from those GPU buffers.
+    if (_resources->deferred_uploads() && _resources->has_pending_uploads())
+    {
+        fmt::println("[RT] build_blas_for_mesh: flushing pending resource uploads before BLAS build");
+        _resources->process_queued_uploads_immediate();
+    }
+
    // Build BLAS with one geometry per surface (skip empty primitives)
    std::vector<VkAccelerationStructureGeometryKHR> geoms;
    std::vector<VkAccelerationStructureBuildRangeInfoKHR> ranges;
@@ -126,7 +160,7 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
    const uint32_t vcount = mesh->meshBuffers.vertexCount;
    VkBuffer vb = mesh->meshBuffers.vertexBuffer.buffer;

-    fmt::println("[RT] getOrBuildBLAS build mesh='{}' surfaces={} vcount={}", mesh->name,
+    fmt::println("[RT] build_blas_for_mesh mesh='{}' surfaces={} vcount={}", mesh->name,
                 mesh->surfaces.size(), vcount);

    for (const auto &s: mesh->surfaces)
@@ -162,9 +196,11 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
        ranges.push_back(r);
    }

-    // If no valid geometries, skip BLAS build
+    // If no valid geometries, record an empty sentinel to avoid re-queuing.
    if (geoms.empty())
    {
+        fmt::println("[RT] build_blas_for_mesh: mesh='{}' has no primitives; skipping BLAS", mesh->name);
+        _blasByMesh.emplace(mesh, AccelStructureHandle{});
        return {};
    }

@@ -231,10 +267,50 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
    dai.accelerationStructure = blas.handle;
    blas.deviceAddress = _vkGetAccelerationStructureDeviceAddressKHR(_device->device(), &dai);

-    _blasByMesh.emplace(mesh.get(), blas);
+    _blasByMesh.emplace(mesh, blas);
    return blas;
 }

+void RayTracingManager::pump_blas_builds(uint32_t max_builds_per_frame)
+{
+    if (max_builds_per_frame == 0 || _blasBuildQueue.empty())
+    {
+        return; // zero budget or nothing queued: no work for this call
+    }
+
+    uint32_t built = 0; // counts only builds that returned a non-null handle
+
+    while (built < max_builds_per_frame && !_blasBuildQueue.empty())
+    {
+        PendingBlasBuild job = _blasBuildQueue.front();
+        _blasBuildQueue.pop_front();
+
+        const MeshAsset* mesh = job.mesh;
+        if (mesh)
+        {
+            // Clear the pending flag before building so later getOrBuildBLAS calls
+            // consult _blasByMesh. NOTE(review): a build that returns an empty
+            // handle without caching a sentinel can be queued again -- confirm.
+            _blasPendingMeshes.erase(mesh);
+
+            // Skip the build if a cache entry (BLAS or empty sentinel) already exists.
+            if (_blasByMesh.find(mesh) == _blasByMesh.end())
+            {
+                AccelStructureHandle blas = build_blas_for_mesh(mesh);
+                if (blas.handle)
+                {
+                    ++built; // empty results do not consume the per-call budget
+                }
+            }
+        }
+        else
+        {
+            // Null job: nothing to build; the erase below is keyed on nullptr.
+            _blasPendingMeshes.erase(mesh);
+        }
+    }
+}
+
 void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize /*requiredScratch*/, DeletionQueue& dq)
 {
    // Recreate TLAS storage if size grows. Defer destruction to the frame DQ to
@@ -296,7 +372,10 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra
            }
            else
            {
-                // Try to build on the fly if the mesh is still alive (non-owning shared_ptr wrapper).
+                // Queue an async BLAS build if the mesh is still alive
+                // (non-owning shared_ptr wrapper). The BLAS will be built
+                // over subsequent frames by pump_blas_builds(); until then,
+                // this instance will be skipped.
                std::shared_ptr<MeshAsset> nonOwning(const_cast<MeshAsset *>(r.sourceMesh), [](MeshAsset *) {});
                blas = getOrBuildBLAS(nonOwning);
            }
@@ -423,6 +502,24 @@ void RayTracingManager::removeBLASForBuffer(VkBuffer vertexBuffer)
 {
    if (!vertexBuffer) return;

+    // Drop any queued builds referencing this vertex buffer.
+    if (!_blasBuildQueue.empty())
+    {
+        for (auto itQ = _blasBuildQueue.begin(); itQ != _blasBuildQueue.end(); )
+        {
+            const MeshAsset* mesh = itQ->mesh;
+            if (mesh && mesh->meshBuffers.vertexBuffer.buffer == vertexBuffer)
+            {
+                _blasPendingMeshes.erase(mesh);
+                itQ = _blasBuildQueue.erase(itQ);
+            }
+            else
+            {
+                ++itQ;
+            }
+        }
+    }
+
    // Find any mesh whose vertex buffer matches and evict its BLAS.
    for (auto it = _blasByMesh.begin(); it != _blasByMesh.end(); )
    {
@@ -443,6 +540,24 @@ void RayTracingManager::removeBLASForBuffer(VkBuffer vertexBuffer)
 void RayTracingManager::removeBLASForMesh(const MeshAsset *mesh)
 {
    if (!mesh) return;
+
+    // Drop any queued builds for this mesh.
+    if (!_blasBuildQueue.empty())
+    {
+        for (auto itQ = _blasBuildQueue.begin(); itQ != _blasBuildQueue.end(); )
+        {
+            if (itQ->mesh == mesh)
+            {
+                itQ = _blasBuildQueue.erase(itQ);
+            }
+            else
+            {
+                ++itQ;
+            }
+        }
+    }
+    _blasPendingMeshes.erase(mesh);
+
    auto it = _blasByMesh.find(mesh);
    if (it == _blasByMesh.end()) return;

--- a/src/core/raytracing/raytracing.h
+++ b/src/core/raytracing/raytracing.h
@@ -1,20 +1,22 @@
 #pragma once
- #include <core/types.h>
- #include <unordered_map>
- #include <vector>
- #include <memory>
- 
- class DeviceManager;
- class ResourceManager;
- struct DrawContext;
- struct MeshAsset;
- 
- struct AccelStructureHandle {
-     VkAccelerationStructureKHR handle{VK_NULL_HANDLE};
-     AllocatedBuffer storage{}; // buffer that backs the AS
-     VkDeviceAddress deviceAddress{0};
- };
- 
+#include <core/types.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <deque>
+#include <memory>
+
+class DeviceManager;
+class ResourceManager;
+struct DrawContext;
+struct MeshAsset;
+
+struct AccelStructureHandle {
+    VkAccelerationStructureKHR handle{VK_NULL_HANDLE}; // null handle is treated as "no BLAS/TLAS available"
+    AllocatedBuffer storage{}; // buffer that backs the AS
+    VkDeviceAddress deviceAddress{0}; // for BLAS: set from vkGetAccelerationStructureDeviceAddressKHR
+};
+
 // Ray tracing helper that caches BLAS per mesh and rebuilds TLAS per frame
 // for hybrid/full ray query shadows. See docs/RayTracing.md.
 class RayTracingManager {
@@ -22,8 +24,16 @@ public:
    void init(DeviceManager* dev, ResourceManager* res);
    void cleanup();
 
-     // Build (or get) BLAS for a mesh. Safe to call multiple times.
-     AccelStructureHandle getOrBuildBLAS(const std::shared_ptr<MeshAsset>& mesh);
+    // Queue a BLAS build for a mesh (if not already built or queued) and
+    // return the cached handle when available. Safe to call multiple times.
+    // When builds are pending, this may return an empty handle; callers
+    // should treat that as "BLAS not ready yet" and skip ray instances.
+    AccelStructureHandle getOrBuildBLAS(const std::shared_ptr<MeshAsset>& mesh);
+
+    // Progress queued BLAS builds. Call once per frame after waiting for the
+    // previous frame's GPU fence. max_builds_per_frame caps how many BLAS are
+    // successfully built per call; jobs that yield no BLAS do not count.
+    void pump_blas_builds(uint32_t max_builds_per_frame = 1);
 
    // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable)
    // Destruction of previous TLAS resources is deferred via the provided frame deletion queue
@@ -41,12 +51,12 @@ public:
    void removeBLASForMesh(const MeshAsset *mesh);
 
 private:
-     // function pointers (resolved on init)
-     PFN_vkCreateAccelerationStructureKHR            _vkCreateAccelerationStructureKHR{};
-     PFN_vkDestroyAccelerationStructureKHR           _vkDestroyAccelerationStructureKHR{};
-     PFN_vkGetAccelerationStructureBuildSizesKHR     _vkGetAccelerationStructureBuildSizesKHR{};
-     PFN_vkCmdBuildAccelerationStructuresKHR         _vkCmdBuildAccelerationStructuresKHR{};
-     PFN_vkGetAccelerationStructureDeviceAddressKHR  _vkGetAccelerationStructureDeviceAddressKHR{};
+    // function pointers (resolved on init)
+    PFN_vkCreateAccelerationStructureKHR            _vkCreateAccelerationStructureKHR{};
+    PFN_vkDestroyAccelerationStructureKHR           _vkDestroyAccelerationStructureKHR{};
+    PFN_vkGetAccelerationStructureBuildSizesKHR     _vkGetAccelerationStructureBuildSizesKHR{};
+    PFN_vkCmdBuildAccelerationStructuresKHR         _vkCmdBuildAccelerationStructuresKHR{};
+    PFN_vkGetAccelerationStructureDeviceAddressKHR  _vkGetAccelerationStructureDeviceAddressKHR{};
 
    DeviceManager* _device{nullptr};
    ResourceManager* _resources{nullptr};
@@ -55,6 +65,16 @@ private:
    // when a mesh is destroyed or its GPU buffers are freed, the owning code
    // must call removeBLASForMesh/removeBLASForBuffer to drop the cached BLAS.
    std::unordered_map<const MeshAsset*, AccelStructureHandle> _blasByMesh;
+
+    struct PendingBlasBuild // one entry in _blasBuildQueue
+    {
+        const MeshAsset* mesh{nullptr}; // non-owning; removeBLASForMesh/removeBLASForBuffer drop matching entries
+    };
+
+    // Queue of BLAS builds to execute over multiple frames.
+    std::deque<PendingBlasBuild> _blasBuildQueue;
+    // Tracks meshes that currently have a build waiting in _blasBuildQueue.
+    std::unordered_set<const MeshAsset*> _blasPendingMeshes;
 
    // TLAS + scratch / instance buffer (rebuilt per frame)
    AccelStructureHandle _tlas{};
@@ -68,5 +88,6 @@ private:
    VkDeviceSize _minScratchAlignment{256};
 
    void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch, DeletionQueue& frameDQ);
+    AccelStructureHandle build_blas_for_mesh(const MeshAsset* mesh);
 };