From 33fc35ab6cd7abf4b122b5a1e536a3786ad4cf03 Mon Sep 17 00:00:00 2001 From: hydrogendeuteride Date: Mon, 8 Dec 2025 15:43:06 +0900 Subject: [PATCH] EDIT: BLAS build is now per-frame async --- docs/IBL.md | 113 ++++++++++++++++++----- docs/RayTracing.md | 18 ++-- docs/RenderGraph.md | 4 +- docs/RenderPasses.md | 88 +++++++++++++++++- docs/SHADERS.md | 29 +++++- docs/materials.md | 20 ++++- src/core/engine.cpp | 4 + src/core/raytracing/raytracing.cpp | 139 ++++++++++++++++++++++++++--- src/core/raytracing/raytracing.h | 69 +++++++++----- 9 files changed, 416 insertions(+), 68 deletions(-) diff --git a/docs/IBL.md b/docs/IBL.md index 2900170..ab5d50e 100644 --- a/docs/IBL.md +++ b/docs/IBL.md @@ -5,8 +5,9 @@ Overview - Shaders share a common include, `shaders/ibl_common.glsl`, which defines the IBL bindings for descriptor set 3 and helper functions used by deferred, forward, and background passes. - The engine currently supports: - Specular environment from an equirectangular 2D texture with prefiltered mips (`sampler2D iblSpec2D`). - - Diffuse irradiance from 2nd‑order SH (9 coefficients baked on the CPU). - - A 2D BRDF integration LUT used for the split‑sum approximation. + - Diffuse irradiance from 2nd-order SH (9 coefficients baked on the CPU). + - A 2D BRDF integration LUT used for the split-sum approximation. + - An optional separate background environment texture (`sampler2D iblBackground2D`); when not provided, the system falls back to using the specular environment for background rendering. Data Flow - Init: @@ -18,49 +19,115 @@ Data Flow - Tries `ktxutil::load_ktx2_cubemap` first. If successful, uploads via `ResourceManager::create_image_compressed_layers` with `VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT`. - If cubemap loading fails, falls back to 2D `.ktx2` via `ktxutil::load_ktx2_2d` and `create_image_compressed`. The image is treated as equirectangular with prefiltered mips. 
- When the specular `.ktx2` is HDR (`R16G16B16A16_SFLOAT` or `R32G32B32A32_SFLOAT`) and 2:1 aspect, `IBLManager` computes 9 SH coefficients on the CPU: - - Integrates the environment over the sphere using real SH basis functions (L2) with solid‑angle weighting. - - Applies Lambert band scaling (A0 = π, A1 = 2π/3, A2 = π/4). + - Integrates the environment over the sphere using real SH basis functions (L2) with solid-angle weighting. + - Applies Lambert band scaling (A0 = pi, A1 = 2pi/3, A2 = pi/4). - Uploads the result as `vec4 sh[9]` in a uniform buffer (`_shBuffer`). - Diffuse: - If `IBLPaths::diffuseCube` is provided and valid, loads it as a cubemap via `load_ktx2_cubemap` + `create_image_compressed_layers`. - Current shaders only use the SH buffer for diffuse; the diffuse cubemap is reserved for future variants. + - Background: + - If `IBLPaths::background2D` is provided and valid, loads it as a 2D equirectangular `.ktx2` via `load_ktx2_2d` + `create_image_compressed`. + - This allows using a separate, potentially higher-resolution or unfiltered environment for the sky background while using a prefiltered version for specular IBL. - BRDF LUT: - Loaded as 2D `.ktx2` via `ktxutil::load_ktx2_2d` and uploaded with `create_image_compressed`. - Fallbacks: - If `diffuseCube` is missing but a specular env exists, `_diff` is aliased to `_spec`. - - `IBLManager::load_async(const IBLPaths&)` + `IBLManager::pump_async()` (runtime path used by the engine): - - `load_async` runs KTX2 file I/O and SH bake on a worker thread and stores a prepared CPU-side description (`PreparedIBLData`). - - `pump_async` is called on the main thread once per frame (after the previous frame is idle) to: - - Destroy old IBL images/SH via `destroy_images_and_sh()`. - - Create new GPU images with `create_image_compressed(_layers)` and upload the SH buffer. - - This avoids stalls in the main/game loop when switching IBL volumes or loading the default environment at startup. 
+ - If `background2D` is missing but a specular env exists, `_background` is aliased to `_spec`. - `IBLManager::unload()` releases GPU images, the SH buffer, and the descriptor set layout. - Descriptor layout: - `IBLManager::ensureLayout()` builds a descriptor set layout (set=3) with: - - binding 0: `COMBINED_IMAGE_SAMPLER` — specular environment (2D equirect). - - binding 1: `COMBINED_IMAGE_SAMPLER` — BRDF LUT 2D. - - binding 2: `UNIFORM_BUFFER` — SH coefficients (`vec4 sh[9]`). + - binding 0: `COMBINED_IMAGE_SAMPLER` - specular environment (2D equirect). + - binding 1: `COMBINED_IMAGE_SAMPLER` - BRDF LUT 2D. + - binding 2: `UNIFORM_BUFFER` - SH coefficients (`vec4 sh[9]`). + - binding 3: `COMBINED_IMAGE_SAMPLER` - background environment (2D equirect, optional). - Passes request this layout from `EngineContext::ibl` and plug it into their pipeline set layouts: - Background: `vk_renderpass_background.cpp` (set 3 used for env background). - Lighting: `vk_renderpass_lighting.cpp` (deferred lighting pass, set 3). - Transparent: `vk_renderpass_transparent.cpp` (forward/transparent materials, set 3). +Asynchronous Loading +- Overview: + - `IBLManager` provides an asynchronous loading path via `load_async()` + `pump_async()` to avoid blocking the main/game loop during IBL environment switches or initial loading. + - Heavy CPU work (KTX2 file I/O, decompression, SH coefficient baking) runs on a dedicated worker thread. + - GPU resource creation (image uploads, buffer allocation) is deferred to the main thread via `pump_async()`. +- API: + - `bool load_async(const IBLPaths &paths)`: + - Queues an asynchronous IBL load job. + - Returns `false` if the job could not be queued (e.g., context not initialized). + - If called while a previous job is still pending, the new request supersedes the old one (the old result is discarded when ready). 
+ - `struct AsyncResult { bool completed; bool success; }`: + - `completed`: `true` when an async job finished since the last `pump_async()` call. + - `success`: `true` when the finished job successfully produced new GPU IBL resources. + - `AsyncResult pump_async()`: + - Must be called on the main thread, typically once per frame after the previous frame's GPU work is idle. + - If a completed async job is pending: + - Destroys old IBL images and SH buffer via `destroy_images_and_sh()`. + - Creates new GPU images with `create_image_compressed(_layers)` and uploads the SH buffer. + - Returns `AsyncResult` indicating whether a job completed and its success status. +- Internal Architecture: + - `IBLManager::init()` spawns a persistent worker thread that waits on a condition variable. + - When `load_async()` is called: + - The request paths and a unique job ID are stored in `AsyncStateData`. + - The worker thread is signaled via condition variable. + - Any previous pending result is invalidated (superseded by the new job ID). + - Worker thread execution: + - Calls `prepare_ibl_cpu()` to load KTX2 files and bake SH coefficients. + - Stores the prepared data (`PreparedIBLData`) in `AsyncStateData`. + - Marks the result as ready with the corresponding job ID. + - If the job ID no longer matches (superseded), the result is discarded. + - Main thread integration (`pump_async()`): + - Checks if a result is ready. + - If ready, calls `commit_prepared()` to create GPU resources from the prepared CPU data. + - Clears the ready flag and returns the result status. +- Thread Safety: + - All shared state in `AsyncStateData` is protected by a mutex. + - The worker thread only reads request data and writes result data. + - The main thread only reads result data and writes request data. + - GPU resource creation is strictly on the main thread. 
+- Usage Example: + ```cpp + // Queue async IBL load (non-blocking) + iblManager->load_async(IBLPaths{ + .specularCube = "assets/ibl/studio_spec.ktx2", + .brdfLut2D = "assets/ibl/brdf_lut.ktx2", + .background2D = "assets/ibl/studio_bg.ktx2" + }); + + // In main loop, after waiting for previous frame: + auto result = iblManager->pump_async(); + if (result.completed) { + if (result.success) { + // New IBL environment is now active + } else { + // Loading failed, handle error (e.g., keep previous IBL) + } + } + ``` +- Benefits: + - No frame stalls when loading large HDR environment maps. + - Seamless IBL volume transitions (e.g., entering a building with different lighting). + - SH baking (CPU-intensive) happens off the main thread. +- Cleanup: + - `IBLManager::unload()` shuts down the async worker thread (joins) and releases all GPU resources. + - The destructor also calls `shutdown_async()` to ensure clean termination. + Shader Side (`shaders/ibl_common.glsl`) - Bindings: - `layout(set=3, binding=0) uniform sampler2D iblSpec2D;` - `layout(set=3, binding=1) uniform sampler2D iblBRDF;` - `layout(std140, set=3, binding=2) uniform IBL_SH { vec4 sh[9]; } iblSH;` + - `layout(set=3, binding=3) uniform sampler2D iblBackground2D;` - Helpers: - `vec3 sh_eval_irradiance(vec3 n)`: - Evaluates the 9 SH basis functions (L2) at direction `n` using the same real SH basis as the CPU bake. - Multiplies each basis value by the corresponding `iblSH.sh[i].rgb` coefficient and sums the result. - Coefficients are already convolved with the Lambert kernel on the CPU; the function returns diffuse irradiance directly. - `vec2 dir_to_equirect(vec3 d)`: - - Normalizes `d`, computes `(phi, theta)` and returns equirectangular UV in `[0,1]²`. + - Normalizes `d`, computes `(phi, theta)` and returns equirectangular UV in `[0,1]^2`. - Used consistently by background, deferred, and forward pipelines. 
- `float ibl_lod_from_roughness(float roughness, float levels)`: - - Computes the mip LOD for specular IBL using `roughness² * (levels - 1)`. - - This biases mid‑roughness reflections towards blurrier mips and avoids overly sharp reflections. + - Computes the mip LOD for specular IBL using `roughness^2 * (levels - 1)`. + - This biases mid-roughness reflections towards blurrier mips and avoids overly sharp reflections. Usage in Passes - Deferred lighting (`shaders/deferred_lighting.frag` and `shaders/deferred_lighting_nort.frag`): @@ -84,24 +151,30 @@ Usage in Passes - Same include and IBL logic as deferred, applied after direct lighting. - Uses the same `ibl_lod_from_roughness` helper for LOD selection. - Background (`shaders/background_env.frag`): - - Includes `ibl_common.glsl` and uses `dir_to_equirect(worldDir)` + `textureLod(iblSpec2D, uv, 0.0)` to render the environment at LOD 0. + - Includes `ibl_common.glsl` and uses `dir_to_equirect(worldDir)` + `textureLod(iblBackground2D, uv, 0.0)` to render the environment at LOD 0. + - When a dedicated background texture is provided via `IBLPaths::background2D`, the background pass renders from `iblBackground2D` which may differ from `iblSpec2D`. Authoring IBL Assets - Specular environment: - Preferred: prefiltered HDR cubemap in `.ktx2` (BC6H or `R16G16B16A16_SFLOAT`) with multiple mips. - - Alternative: prefiltered equirectangular 2D `.ktx2` with width = 2 × height and full mip chain. + - Alternative: prefiltered equirectangular 2D `.ktx2` with width = 2 x height and full mip chain. - Make sure the mip chain is generated with a GGX importance sampling tool so the BRDF LUT + mip chain match. - BRDF LUT: - A standard 2D preintegrated GGX LUT (RG), usually stored as `R8G8_UNORM` or BC5. - The LUT is sampled with `(NoV, roughness)` coordinates. - Diffuse: - The engine currently uses SH coefficients baked from the specular equirectangular map. 
If you provide a separate diffuse cubemap, the CPU SH bake still uses the specular HDR; you can adjust this in `IBLManager` if you want SH to come from a different source. +- Background: + - Optional: equirectangular 2D `.ktx2` used exclusively for the sky background pass. + - Useful when you want a sharper or unfiltered environment for the visible sky while using a prefiltered version for specular reflections. + - If not provided, the system falls back to using `specularCube` for background rendering. Implementation Notes - CPU SH bake: - Implemented in `IBLManager::load` using libktx to access raw HDR pixel data from `.ktx2`. - - Uses a simple nested loop over pixels with solid‑angle weighting and the same SH basis as `sh_eval_irradiance`. + - Uses a simple nested loop over pixels with solid-angle weighting and the same SH basis as `sh_eval_irradiance`. - Fallbacks: - Lighting and transparent passes create small fallback textures so that the IBL descriptor set is always valid, even when no IBL assets are loaded. - - Background pass builds a 1×1×6 black cube as a fallback env. + - Background pass builds a 1x1x6 black cube as a fallback env. + - When `background2D` is not provided, `IBLManager::background()` returns the same image as `specular()`. diff --git a/docs/RayTracing.md b/docs/RayTracing.md index a2938e6..091687f 100644 --- a/docs/RayTracing.md +++ b/docs/RayTracing.md @@ -12,22 +12,27 @@ Optional subsystem that enables hybrid or full ray traced shadows via Ray Query. ### BLAS Build & Cache - `AccelStructureHandle getOrBuildBLAS(const std::shared_ptr<MeshAsset>& mesh)`: - - One GAS per `MeshAsset`, keyed by vertex buffer `VkBuffer`. + - One GAS per `MeshAsset`, keyed by mesh pointer. - Populated with one triangle geometry per `GeoSurface`. - Built with `VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR` and device-local storage + scratch. - - Cached in `_blasByVB` for reuse across frames.
- - Called from `AssetManager::createMesh(...)` and from GLTF loader after mesh upload. + - Cached in `_blasByMesh` for reuse across frames. + - When a BLAS does not exist yet, the mesh is queued for an asynchronous build and an empty handle is returned; callers must treat this as "BLAS not ready" and skip the instance for the current frame (see TLAS section below). + - `pump_blas_builds(max_builds_per_frame)` advances an internal BLAS build queue and is called once per frame from the engine main loop to spread work across multiple frames instead of doing all BLAS builds in a single spike. ### TLAS Rebuild Per Frame - `VkAccelerationStructureKHR buildTLASFromDrawContext(const DrawContext& dc)`: - Iterates `dc.OpaqueSurfaces` and creates one instance per render object. - - Looks up BLAS by `RenderObject::vertexBuffer`; if missing, instance is skipped. + - Looks up BLAS by `RenderObject::sourceMesh` (the `MeshAsset*`); if a BLAS is not cached yet, it calls `getOrBuildBLAS` with a non-owning `shared_ptr` to queue a build and then skips the instance for this frame. - Uploads instances to a CPU→GPU buffer with device address. - Builds TLAS with `immediate_submit` and stores device address for Ray Query. ### Renderer Integration - In `VulkanEngine::draw()` before building passes: - - If RT mode is enabled (`shadowSettings.mode != 0`) and manager exists, TLAS is rebuilt from the latest draw context. + - If RT mode is enabled (`shadowSettings.mode != 0`) or ray-traced SSR is enabled (`enableSSR && reflectionMode != 0`), and the manager exists, TLAS is rebuilt from the latest draw context. + - TLAS only references BLAS that are already built; instances whose meshes are still in the BLAS build queue are skipped until their BLAS completes. +- In `VulkanEngine::run()` at the start of each frame (after waiting for the previous frame fence): + - Calls `RayTracingManager::flushPendingDeletes()` to safely destroy any BLAS scheduled for deferred deletion. 
+ - Calls `RayTracingManager::pump_blas_builds(1)` to build at most one queued BLAS per frame (tunable if you want more or fewer builds per frame). - Lighting pass binds the TLAS at `set=0,binding=1` when available. ### Modes & UI @@ -36,7 +41,8 @@ Optional subsystem that enables hybrid or full ray traced shadows via Ray Query. - Mode 2: Ray Query only (no shadow maps). ### Notes & Caveats -- BLAS cache key is the vertex buffer handle; if you rebuild meshes in-place, BLAS must be invalidated. +- BLAS cache key is the `MeshAsset*`. If you destroy or rebuild meshes (or their GPU buffers) you must invalidate associated BLAS via `RayTracingManager::removeBLASForMesh(mesh)` or `removeBLASForBuffer(vertexBuffer)`. - CPU→GPU memory is used for the TLAS instance buffer to simplify updates. On some platforms, you may prefer staging + device-local. +- Because BLAS builds are asynchronous and capped per frame, newly spawned meshes may take a few frames before they appear in the ray-traced path; this is a deliberate tradeoff to avoid large hitches when many meshes are introduced. - The RT path requires Vulkan 1.2+ with Ray Query and Acceleration Structure features available. diff --git a/docs/RenderGraph.md b/docs/RenderGraph.md index be1506d..c6768e3 100644 --- a/docs/RenderGraph.md +++ b/docs/RenderGraph.md @@ -111,8 +111,8 @@ Buffer usage → stage/access examples: ### Built‑In Pass Wiring (Current) -- Resource uploads (if any) → Background (compute) → Geometry (G‑Buffer) → Lighting (deferred) → Transparent → CopyToSwapchain → ImGui → PreparePresent. -- See registrations: `src/core/engine.cpp:321`–`src/core/engine.cpp:352`. +- Resource uploads (if any) → Background (compute) → Geometry (G‑Buffer) → Lighting (deferred) → SSR → Tonemap+Bloom → FXAA → Transparent → CopyToSwapchain → ImGui → PreparePresent. +- See registrations in `src/core/engine.cpp`. 
### Notes & Limits diff --git a/docs/RenderPasses.md b/docs/RenderPasses.md index c3fa13a..a749f36 100644 --- a/docs/RenderPasses.md +++ b/docs/RenderPasses.md @@ -68,10 +68,14 @@ addPass(std::move(myPass)); ### Built-in Passes - Background (compute): Declares `ComputeWrite(drawImage)` and dispatches a selected effect instance. -- Geometry (G-Buffer): Declares 3 color attachments and `DepthAttachment`, plus buffer reads for shared index/vertex buffers. -- Lighting (deferred): Reads G‑Buffer as sampled images and writes to `drawImage`. +- Geometry (G-Buffer): Declares 4 color attachments (position, normal+roughness, albedo+metallic, AO+emissive) and `DepthAttachment`, plus buffer reads for shared index/vertex buffers. +- Lighting (deferred): Reads G‑Buffer as sampled images and writes to `drawImage`. Applies AO to indirect lighting and adds emissive contribution. - Shadows: Cascaded shadow maps render to per-frame transient depth images (four cascades). If Ray Query is enabled, the lighting pass additionally samples TLAS to evaluate shadow visibility according to the selected mode. +- SSR (Screen Space Reflections): Reads HDR lighting result + G-Buffer and outputs reflections blended with the scene. + Two variants: `ssr.nort` (screen-space only) and `ssr.rt` (SSR + RT fallback using TLAS ray queries). +- Tonemap + Bloom: Converts HDR to LDR with exposure control and optional bloom. Supports Reinhard and ACES tonemapping. +- FXAA: Post-process anti-aliasing on the LDR tonemapped image. Simple 5-tap edge-detection blur. - Transparent (forward): Writes to `drawImage` with depth test against `depthImage` after lighting. - ImGui: Inserted just before present to draw on the swapchain image. @@ -90,3 +94,83 @@ addPass(std::move(myPass)); See also: `docs/RenderGraph.md` for the builder API and synchronization details. +--- + +## Post-Processing Pipeline + +After deferred lighting, the engine runs a post-processing chain: SSR → Tonemap (with Bloom) → FXAA → Present. 
+ +### SSR (Screen Space Reflections) + +Located in `src/render/passes/ssr.cpp` and `shaders/ssr.frag` / `shaders/ssr_rt.frag`. + +**Algorithm:** +- World-space ray marching along the reflection vector `R = reflect(-V, N)`. +- Depth comparison against G-Buffer position to find intersection. +- Fresnel (Schlick) and glossiness-based blending with the base HDR color. + +**Parameters (shader constants):** +- `MAX_STEPS = 64` – maximum ray march iterations (reduced for rough surfaces). +- `STEP_LENGTH = 0.5` – world units per step. +- `MAX_DISTANCE = 50.0` – maximum ray travel distance. +- `THICKNESS = 3.0` – depth tolerance for hit detection. + +**Variants:** +- `ssr.nort` – Pure screen-space reflections. +- `ssr.rt` – SSR + RT fallback using TLAS ray queries when SSR misses (requires `GL_EXT_ray_query`). + Reflection mode controlled via `sceneData.rtOptions.w`: 0 = SSR only, 1 = SSR + RT fallback, 2 = RT only. + +**Inputs (set=1):** +- binding 0: `hdrColor` – HDR lighting result. +- binding 1: `posTex` – G-Buffer world position (RGBA32F). +- binding 2: `normalTex` – G-Buffer normal + roughness. +- binding 3: `albedoTex` – G-Buffer albedo + metallic. + +--- + +### Tonemap + Bloom + +Located in `src/render/passes/tonemap.cpp` and `shaders/tonemap.frag`. + +**Tonemapping modes:** +- `mode = 0` – Reinhard: `x / (1 + x)`. +- `mode = 1` – ACES (Narkowicz approximation, default). + +**Bloom:** +- Simple gather-based bloom computed in HDR space before tonemapping. +- 5×5 kernel (radius=2) samples neighbors; pixels exceeding `bloomThreshold` contribute weighted by their brightness. +- Accumulated bloom is multiplied by `bloomIntensity` and added to the HDR color. + +**Runtime parameters:** +- `exposure` (default 1.0) – exposure multiplier. +- `bloomEnabled` (default true) – toggle bloom. +- `bloomThreshold` (default 1.0) – brightness threshold for bloom contribution. +- `bloomIntensity` (default 0.7) – bloom blend strength. 
+ +**Output:** LDR image (`VK_FORMAT_R8G8B8A8_UNORM`) with gamma correction (γ = 2.2). + +--- + +### FXAA (Fast Approximate Anti-Aliasing) + +Located in `src/render/passes/fxaa.cpp` and `shaders/fxaa.frag`. + +**Algorithm:** +- Luma-based edge detection using a 5-tap cross pattern (N, S, E, W, center). +- If luma range exceeds threshold, apply a simple box blur; otherwise pass through. + +**Runtime parameters:** +- `enabled` (default true) – toggle FXAA. +- `edge_threshold` (default 0.125) – relative contrast threshold. +- `edge_threshold_min` (default 0.0312) – absolute minimum threshold. + +**Push constants:** +```glsl +layout(push_constant) uniform Push { + float inverse_width; + float inverse_height; + float edge_threshold; + float edge_threshold_min; +} pc; +``` + diff --git a/docs/SHADERS.md b/docs/SHADERS.md index 1865847..a17e135 100644 --- a/docs/SHADERS.md +++ b/docs/SHADERS.md @@ -23,9 +23,36 @@ - `vec4 metal_rough_factors; // x = metallic, y = roughness` - `vec4 extra[14]; // extra[0].x = normalScale` - Material texture bindings (set=1): - - binding=1 `colorTex`, binding=2 `metalRoughTex`, binding=3 `normalMap`. + - binding=1 `colorTex`, binding=2 `metalRoughTex`, binding=3 `normalMap`, binding=4 `occlusionTex`, binding=5 `emissiveTex`. - Adding a pipeline (graphics) - Fill `GraphicsPipelineCreateInfo` with shader paths, descriptor set layouts, optional push constants, and a `configure(PipelineBuilder&)` callback to set topology, raster, depth/blend, and attachment formats. - Register with `PipelineManager::createGraphicsPipeline(name, info)`. Retrieve via `getGraphics` or `getMaterialPipeline`. 
+Shader File Reference + +| File | Stage | Description | +|------|-------|-------------| +| `fullscreen.vert` | Vertex | Fullscreen triangle for post-process passes | +| `mesh.vert` | Vertex | Standard mesh vertex transform | +| `mesh.frag` | Fragment | Forward shading (deprecated, use gbuffer) | +| `gbuffer.frag` | Fragment | G-Buffer output (position, normal, albedo, AO+emissive) | +| `deferred_lighting.frag` | Fragment | Deferred lighting with RT shadows + IBL | +| `deferred_lighting_nort.frag` | Fragment | Deferred lighting without RT | +| `shadow.vert/.frag` | Vertex/Fragment | Shadow map generation | +| `ssr.frag` | Fragment | Screen-space reflections (ray march) | +| `ssr_rt.frag` | Fragment | SSR + RT fallback (ray query) | +| `tonemap.frag` | Fragment | HDR→LDR tonemapping + bloom | +| `fxaa.frag` | Fragment | Fast approximate anti-aliasing | +| `sky.comp` | Compute | Procedural sky background | +| `gradient_color.comp` | Compute | Gradient background | +| `background_env.frag` | Fragment | Environment map background | + +GLSL Includes + +| File | Purpose | +|------|---------| +| `input_structures.glsl` | SceneData UBO, material bindings, light structs | +| `lighting_common.glsl` | BRDF evaluation, point light helpers | +| `ibl_common.glsl` | IBL split-sum, SH irradiance | + diff --git a/docs/materials.md b/docs/materials.md index 80ee22b..1a051b4 100644 --- a/docs/materials.md +++ b/docs/materials.md @@ -1,20 +1,27 @@ Materials and Textures Overview (PBR) -Current state (as of Nov 1, 2025) - PBR textures bound per material (set=1): - binding=0: GLTFMaterialData (UBO) - binding=1: `colorTex` (albedo/base color) — sRGB - binding=2: `metalRoughTex` (G=roughness, B=metallic) — UNORM - binding=3: `normalMap` (tangent-space normal, UNORM) + - binding=4: `occlusionTex` (ambient occlusion, R channel) — UNORM + - binding=5: `emissiveTex` (emissive RGB) — sRGB - G‑Buffer writes world‑space normals. 
Tangent‑space normal maps are decoded with TBN using a sign‑correct bitangent (B = sign * cross(N, T)). - Numeric fallbacks via `MaterialConstants` (CPU) / `GLTFMaterialData` (GPU): - `colorFactors` (RGBA). Defaults to 1 if zero. - `metal_rough_factors` (X=metallic, Y=roughness). Roughness is clamped to ≥ 0.04 in shaders. - `extra[0].x` = `normalScale` (scalar, default 1.0). Multiplies the XY of decoded normal. + - `extra[0].y` = `aoStrength` (scalar, 0–1). Controls AO influence. + - `extra[0].z` = `hasAO` (flag, 1 = use AO texture, 0 = skip). + - `extra[1].rgb` = `emissiveFactor` (vec3). Multiplied with emissive texture. + - `extra[2].x` = `alphaCutoff` (scalar). For MASK alpha mode. - Defaults when a texture is missing: - Albedo → checkerboard error texture - MR → white (no effect) - Normal → 1×1 flat normal (0.5, 0.5, 1.0) + - Occlusion → 1×1 white (AO = 1.0, no occlusion) + - Emissive → 1×1 black (no emission) Implications for primitive meshes - Primitives can use: @@ -41,3 +48,14 @@ Usage Examples - Adjust normal strength per material: set `material.constants.extra[0].x` (CPU) or `normalTexture.scale` in glTF. - Primitive with PBR textures: - Set `MeshMaterialDesc::Kind::Textured` and fill `albedoPath`, `metalRoughPath`, and `normalPath`. + +G-Buffer Outputs +- The geometry pass (`gbuffer.frag`) writes 4 render targets: + - `outPos` (location 0): World position (xyz) + valid flag (w=1). + - `outNorm` (location 1): World normal (xyz) + roughness (w). + - `outAlbedo` (location 2): Albedo (rgb) + metallic (a). + - `outExtra` (location 3): AO (x) + emissive (yzw). +- Deferred lighting reads these and computes: + ```glsl + vec3 color = direct + indirect * ao + emissive; + ``` diff --git a/src/core/engine.cpp b/src/core/engine.cpp index 232e047..e91d119 100644 --- a/src/core/engine.cpp +++ b/src/core/engine.cpp @@ -1185,6 +1185,10 @@ void VulkanEngine::run() // Safe to destroy any BLAS queued for deletion now that the previous frame is idle. 
if (_rayManager) { _rayManager->flushPendingDeletes(); } + // Progress queued BLAS builds over multiple frames to avoid large + // stalls when many meshes require ray tracing structures at once. + if (_rayManager) { _rayManager->pump_blas_builds(1); } + // Commit any completed async IBL load now that the GPU is idle. if (_iblManager && _pendingIBLRequest.active) { diff --git a/src/core/raytracing/raytracing.cpp b/src/core/raytracing/raytracing.cpp index e2fbd3c..a722f22 100644 --- a/src/core/raytracing/raytracing.cpp +++ b/src/core/raytracing/raytracing.cpp @@ -35,6 +35,8 @@ void RayTracingManager::cleanup() VkDevice dv = _device->device(); // Destroy any deferred BLAS first flushPendingDeletes(); + _blasBuildQueue.clear(); + _blasPendingMeshes.clear(); if (_tlas.handle) { @@ -100,21 +102,53 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrdeferred_uploads() && _resources->has_pending_uploads()) - { - fmt::println("[RT] getOrBuildBLAS: flushing pending resource uploads before BLAS build"); - _resources->process_queued_uploads_immediate(); - } + const MeshAsset* key = mesh.get(); - if (auto it = _blasByMesh.find(mesh.get()); it != _blasByMesh.end()) + // If a BLAS is already cached (even an empty sentinel), return it directly. + if (auto it = _blasByMesh.find(key); it != _blasByMesh.end()) { fmt::println("[RT] getOrBuildBLAS reuse by mesh mesh='{}' handle={}", mesh->name, static_cast(it->second.handle)); return it->second; } + // If a build is already queued or in progress for this mesh, do not enqueue + // another job; simply report "not ready yet". + if (_blasPendingMeshes.find(key) != _blasPendingMeshes.end()) + { + fmt::println("[RT] getOrBuildBLAS pending build mesh='{}'", mesh->name); + return {}; + } + + // If uploads are deferred, ensure any pending mesh buffer uploads are flushed + // before queuing a BLAS that will read from those GPU buffers. 
+ if (_resources && _resources->deferred_uploads() && _resources->has_pending_uploads()) + { + fmt::println("[RT] getOrBuildBLAS: flushing pending resource uploads before queuing BLAS build"); + _resources->process_queued_uploads_immediate(); + } + + fmt::println("[RT] getOrBuildBLAS queue build mesh='{}'", mesh->name); + _blasPendingMeshes.insert(key); + _blasBuildQueue.push_back(PendingBlasBuild{key}); + + // BLAS will be built asynchronously by pump_blas_builds(); until then, + // callers should treat the empty handle as "not ready yet". + return {}; +} + +AccelStructureHandle RayTracingManager::build_blas_for_mesh(const MeshAsset *mesh) +{ + if (!mesh || !_resources || !_device) return {}; + + // If uploads are deferred, ensure any pending mesh buffer uploads are flushed + // before building a BLAS that reads from those GPU buffers. + if (_resources->deferred_uploads() && _resources->has_pending_uploads()) + { + fmt::println("[RT] build_blas_for_mesh: flushing pending resource uploads before BLAS build"); + _resources->process_queued_uploads_immediate(); + } + // Build BLAS with one geometry per surface (skip empty primitives) std::vector geoms; std::vector ranges; @@ -126,7 +160,7 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrmeshBuffers.vertexCount; VkBuffer vb = mesh->meshBuffers.vertexBuffer.buffer; - fmt::println("[RT] getOrBuildBLAS build mesh='{}' surfaces={} vcount={}", mesh->name, + fmt::println("[RT] build_blas_for_mesh mesh='{}' surfaces={} vcount={}", mesh->name, mesh->surfaces.size(), vcount); for (const auto &s: mesh->surfaces) @@ -162,9 +196,11 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrname); + _blasByMesh.emplace(mesh, AccelStructureHandle{}); return {}; } @@ -231,10 +267,50 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrdevice(), &dai); - _blasByMesh.emplace(mesh.get(), blas); + _blasByMesh.emplace(mesh, blas); return blas; } +void 
RayTracingManager::pump_blas_builds(uint32_t max_builds_per_frame) +{ + if (max_builds_per_frame == 0 || _blasBuildQueue.empty()) + { + return; + } + + uint32_t built = 0; + + while (built < max_builds_per_frame && !_blasBuildQueue.empty()) + { + PendingBlasBuild job = _blasBuildQueue.front(); + _blasBuildQueue.pop_front(); + + const MeshAsset* mesh = job.mesh; + if (mesh) + { + // Drop the pending flag for this mesh now; if the build ends up + // with an empty handle, getOrBuildBLAS will see the cache entry + // (including the empty sentinel) and avoid re-queuing. + _blasPendingMeshes.erase(mesh); + + // Skip if a BLAS was already created meanwhile. + if (_blasByMesh.find(mesh) == _blasByMesh.end()) + { + AccelStructureHandle blas = build_blas_for_mesh(mesh); + if (blas.handle) + { + ++built; + } + } + } + else + { + // Mesh pointer is null; just drop the pending flag. + _blasPendingMeshes.erase(mesh); + } + } +} + void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize /*requiredScratch*/, DeletionQueue& dq) { // Recreate TLAS storage if size grows. Defer destruction to the frame DQ to @@ -296,7 +372,10 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra } else { - // Try to build on the fly if the mesh is still alive (non-owning shared_ptr wrapper). + // Queue an async BLAS build if the mesh is still alive + // (non-owning shared_ptr wrapper). The BLAS will be built + // over subsequent frames by pump_blas_builds(); until then, + // this instance will be skipped. std::shared_ptr<MeshAsset> nonOwning(const_cast<MeshAsset *>(r.sourceMesh), [](MeshAsset *) {}); blas = getOrBuildBLAS(nonOwning); } @@ -423,6 +502,24 @@ void RayTracingManager::removeBLASForBuffer(VkBuffer vertexBuffer) { if (!vertexBuffer) return; + // Drop any queued builds referencing this vertex buffer.
+ if (!_blasBuildQueue.empty()) + { + for (auto itQ = _blasBuildQueue.begin(); itQ != _blasBuildQueue.end(); ) + { + const MeshAsset* mesh = itQ->mesh; + if (mesh && mesh->meshBuffers.vertexBuffer.buffer == vertexBuffer) + { + _blasPendingMeshes.erase(mesh); + itQ = _blasBuildQueue.erase(itQ); + } + else + { + ++itQ; + } + } + } + // Find any mesh whose vertex buffer matches and evict its BLAS. for (auto it = _blasByMesh.begin(); it != _blasByMesh.end(); ) { @@ -443,6 +540,24 @@ void RayTracingManager::removeBLASForBuffer(VkBuffer vertexBuffer) void RayTracingManager::removeBLASForMesh(const MeshAsset *mesh) { if (!mesh) return; + + // Drop any queued builds for this mesh. + if (!_blasBuildQueue.empty()) + { + for (auto itQ = _blasBuildQueue.begin(); itQ != _blasBuildQueue.end(); ) + { + if (itQ->mesh == mesh) + { + itQ = _blasBuildQueue.erase(itQ); + } + else + { + ++itQ; + } + } + } + _blasPendingMeshes.erase(mesh); + auto it = _blasByMesh.find(mesh); if (it == _blasByMesh.end()) return; diff --git a/src/core/raytracing/raytracing.h b/src/core/raytracing/raytracing.h index e98bad9..30bc904 100644 --- a/src/core/raytracing/raytracing.h +++ b/src/core/raytracing/raytracing.h @@ -1,20 +1,22 @@ #pragma once - #include - #include - #include - #include - - class DeviceManager; - class ResourceManager; - struct DrawContext; - struct MeshAsset; - - struct AccelStructureHandle { - VkAccelerationStructureKHR handle{VK_NULL_HANDLE}; - AllocatedBuffer storage{}; // buffer that backs the AS - VkDeviceAddress deviceAddress{0}; - }; - +#include +#include +#include +#include +#include +#include + +class DeviceManager; +class ResourceManager; +struct DrawContext; +struct MeshAsset; + +struct AccelStructureHandle { + VkAccelerationStructureKHR handle{VK_NULL_HANDLE}; + AllocatedBuffer storage{}; // buffer that backs the AS + VkDeviceAddress deviceAddress{0}; +}; + // Ray tracing helper that caches BLAS per mesh and rebuilds TLAS per frame // for hybrid/full ray query shadows. 
See docs/RayTracing.md. class RayTracingManager { @@ -22,8 +24,16 @@ public: void init(DeviceManager* dev, ResourceManager* res); void cleanup(); - // Build (or get) BLAS for a mesh. Safe to call multiple times. - AccelStructureHandle getOrBuildBLAS(const std::shared_ptr& mesh); + // Queue a BLAS build for a mesh (if not already built or queued) and + // return the cached handle when available. Safe to call multiple times. + // When builds are pending, this may return an empty handle; callers + // should treat that as "BLAS not ready yet" and skip ray instances. + AccelStructureHandle getOrBuildBLAS(const std::shared_ptr& mesh); + + // Progress asynchronous BLAS builds. Call once per frame after waiting + // for the previous frame's GPU fence. max_builds_per_frame controls how + // many BLAS are built in this pump to spread work over multiple frames. + void pump_blas_builds(uint32_t max_builds_per_frame = 1); // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable) // Destruction of previous TLAS resources is deferred via the provided frame deletion queue @@ -41,12 +51,12 @@ public: void removeBLASForMesh(const MeshAsset *mesh); private: - // function pointers (resolved on init) - PFN_vkCreateAccelerationStructureKHR _vkCreateAccelerationStructureKHR{}; - PFN_vkDestroyAccelerationStructureKHR _vkDestroyAccelerationStructureKHR{}; - PFN_vkGetAccelerationStructureBuildSizesKHR _vkGetAccelerationStructureBuildSizesKHR{}; - PFN_vkCmdBuildAccelerationStructuresKHR _vkCmdBuildAccelerationStructuresKHR{}; - PFN_vkGetAccelerationStructureDeviceAddressKHR _vkGetAccelerationStructureDeviceAddressKHR{}; + // function pointers (resolved on init) + PFN_vkCreateAccelerationStructureKHR _vkCreateAccelerationStructureKHR{}; + PFN_vkDestroyAccelerationStructureKHR _vkDestroyAccelerationStructureKHR{}; + PFN_vkGetAccelerationStructureBuildSizesKHR _vkGetAccelerationStructureBuildSizesKHR{}; + PFN_vkCmdBuildAccelerationStructuresKHR 
_vkCmdBuildAccelerationStructuresKHR{}; + PFN_vkGetAccelerationStructureDeviceAddressKHR _vkGetAccelerationStructureDeviceAddressKHR{}; DeviceManager* _device{nullptr}; ResourceManager* _resources{nullptr}; @@ -55,6 +65,16 @@ private: // when a mesh is destroyed or its GPU buffers are freed, the owning code // must call removeBLASForMesh/removeBLASForBuffer to drop the cached BLAS. std::unordered_map _blasByMesh; + + struct PendingBlasBuild + { + const MeshAsset* mesh{nullptr}; + }; + + // Queue of BLAS builds to execute over multiple frames. + std::deque _blasBuildQueue; + // Tracks meshes that have a queued or in-progress BLAS build. + std::unordered_set _blasPendingMeshes; // TLAS + scratch / instance buffer (rebuilt per frame) AccelStructureHandle _tlas{}; @@ -68,5 +88,6 @@ private: VkDeviceSize _minScratchAlignment{256}; void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch, DeletionQueue& frameDQ); + AccelStructureHandle build_blas_for_mesh(const MeshAsset* mesh); };