diff --git a/shaders/deferred_lighting_nort.frag b/shaders/deferred_lighting_nort.frag new file mode 100644 index 0000000..e452a4c --- /dev/null +++ b/shaders/deferred_lighting_nort.frag @@ -0,0 +1,273 @@ +#version 450 +#extension GL_GOOGLE_include_directive : require +#include "input_structures.glsl" + +layout(location=0) in vec2 inUV; +layout(location=0) out vec4 outColor; + +layout(set=1, binding=0) uniform sampler2D posTex; +layout(set=1, binding=1) uniform sampler2D normalTex; +layout(set=1, binding=2) uniform sampler2D albedoTex; +layout(set=2, binding=0) uniform sampler2D shadowTex[4]; + +// Tunables for shadow quality and blending +// Border smoothing width in light-space NDC (0..1). Larger = wider cross-fade. +const float SHADOW_BORDER_SMOOTH_NDC = 0.08; +// Base PCF radius in texels for cascade 0; higher cascades scale this up slightly. +const float SHADOW_PCF_BASE_RADIUS = 1.35; +// Extra PCF radius in texels for coarser cascades: sampleCascadeShadow adds 0 at cascade 0 and interpolates linearly up to this value at cascade 3 +const float SHADOW_PCF_CASCADE_GAIN = 2.0; // extra radius at far end +// Receiver normal-based offset to reduce acne (in world units) +const float SHADOW_NORMAL_OFFSET = 0.0025; +// Scale for receiver-plane depth bias term (tweak if over/under biased) +const float SHADOW_RPDB_SCALE = 1.0; +// Minimum clamp to keep a tiny bias even on perpendicular receivers +const float SHADOW_MIN_BIAS = 1e-5; + +const float PI = 3.14159265359; +// Deterministic hash of a 2D value into [0,1) (the result of fract); sampleCascadeShadow uses it to pick a per-pixel rotation angle for the Poisson taps +float hash12(vec2 p) +{ + vec3 p3 = fract(vec3(p.xyx) * 0.1031); + p3 += dot(p3, p3.yzx + 33.33); return fract((p3.x + p3.y) * p3.z); +} +// 16 fixed 2D sample offsets used as PCF taps; every offset is shorter than 0.5 +const vec2 POISSON_16[16] = vec2[16]( +vec2(0.2852, -0.1883), vec2(-0.1464, 0.2591), +vec2(-0.3651, -0.0974), vec2(0.0901, 0.3807), +vec2(0.4740, 0.0679), vec2(-0.0512, -0.4466), +vec2(-0.4497, 0.1673), vec2(0.3347, 0.3211), +vec2(0.1948, -0.4196), vec2(-0.2919, -0.3291), +vec2(-0.0763, 0.4661), vec2(0.4421, -0.2217), +vec2(0.0281, -0.2468), vec2(-0.2104, 0.0573), +vec2(0.1197, 0.0779), vec2(-0.0905, -0.1203) +); + +// 
Compute primary cascade and an optional neighbor for cross-fade near borders +struct CascadeMix { uint i0; uint i1; float w1; }; // i0 = primary cascade, i1 = cascade to blend with (equal to i0 when not blending), w1 = blend weight toward i1 + +CascadeMix computeCascadeMix(vec3 worldPos) +{ + uint primary = 3u; // fallback when no cascade contains the point + vec3 ndcP = vec3(0); + for (uint i = 0u; i < 4u; ++i) + { + vec4 lclip = sceneData.lightViewProjCascades[i] * vec4(worldPos, 1.0); + vec3 ndc = lclip.xyz / max(lclip.w, 1e-6); + if (abs(ndc.x) <= 1.0 && abs(ndc.y) <= 1.0 && ndc.z >= 0.0 && ndc.z <= 1.0) + { + primary = i; // first (lowest-index) cascade whose light-space NDC box contains the point + ndcP = ndc; + break; + } + } + + CascadeMix cm; cm.i0 = primary; cm.i1 = primary; cm.w1 = 0.0; + + if (primary < 3u) + { + float edge = max(abs(ndcP.x), abs(ndcP.y)); // 0..1, 1 at border + // start blending when we are within S of the border (S = SHADOW_BORDER_SMOOTH_NDC) + float t = clamp((edge - (1.0 - SHADOW_BORDER_SMOOTH_NDC)) / max(SHADOW_BORDER_SMOOTH_NDC, 1e-4), 0.0, 1.0); + float w = smoothstep(0.0, 1.0, t); + + if (w > 0.0) + { + // Only blend if neighbor actually covers the point + uint neighbor = primary + 1u; + vec4 lclipN = sceneData.lightViewProjCascades[neighbor] * vec4(worldPos, 1.0); + vec3 ndcN = lclipN.xyz / max(lclipN.w, 1e-6); + bool insideN = (abs(ndcN.x) <= 1.0 && abs(ndcN.y) <= 1.0 && ndcN.z >= 0.0 && ndcN.z <= 1.0); + if (insideN) + { + cm.i1 = neighbor; + cm.w1 = w; + } + } + } + + return cm; +} + +// Compute receiver-plane depth gradient dz/duv using derivatives of shadow NDC; `ndc` itself is not read, only its screen-space derivatives are +// Reference: Akenine-Möller et al., "Receiver Plane Depth Bias" (PCF-friendly) +vec2 receiverPlaneDepthGradient(vec3 ndc, vec3 dndc_dx, vec3 dndc_dy) +{ + // Convert XY to shadow map UV derivatives (ndc -> uv: u = 0.5*x + 0.5) + vec2 duv_dx = 0.5 * dndc_dx.xy; + vec2 duv_dy = 0.5 * dndc_dy.xy; + + // Build J with columns (du/dx, du/dy) and (dv/dx, dv/dy); read as a matrix its rows are (du/dx, dv/dx) and (du/dy, dv/dy), so J * (dz/du, dz/dv) = (dz/dx, dz/dy) + mat2 J = mat2(duv_dx.x, duv_dy.x, + duv_dx.y, duv_dy.y); + + // Depth derivatives w.r.t screen pixels + vec2 dz_dxdy = vec2(dndc_dx.z, dndc_dy.z); + + // Invert J to obtain dz/du and dz/dv. Guard against near-singular Jacobian. 
+ float det = J[0][0] * J[1][1] - J[1][0] * J[0][1]; + if (abs(det) < 1e-8) + { + // Degenerate mapping; return zero gradient so only slope/const bias applies + return vec2(0.0); + } + + // Manual inverse for stability/perf on some drivers + mat2 invJ = (1.0 / det) * mat2( J[1][1], -J[0][1], + -J[1][0], J[0][0]); + return invJ * dz_dxdy; // (dz/du, dz/dv) +} +// PCF visibility in [0,1] (1 = fully lit) of worldPos in cascade ci: 16 rotated Poisson taps, each compared using a slope bias plus a receiver-plane depth bias. Returns 1.0 when the point projects outside the cascade's shadow map. +float sampleCascadeShadow(uint ci, vec3 worldPos, vec3 N, vec3 L) +{ + mat4 lightMat = sceneData.lightViewProjCascades[ci]; + + vec4 lclip = lightMat * vec4(worldPos, 1.0); + vec3 ndc = lclip.xyz / max(lclip.w, 1e-6); // guarded divide, matching computeCascadeMix so both agree on where the point lands + vec2 suv = ndc.xy * 0.5 + 0.5; + vec3 dndc_dx = dFdx(ndc); // derivatives are taken before the early-out below: dFdx/dFdy are undefined once invocations of a quad have diverged + vec3 dndc_dy = dFdy(ndc); + + if (any(lessThan(suv, vec2(0.0))) || any(greaterThan(suv, vec2(1.0)))) + return 1.0; + + float current = clamp(ndc.z, 0.0, 1.0); + + // Slope-based tiny baseline bias (cheap safety net) + float NoL = max(dot(N, L), 0.0); + float slopeBias = max(0.0006 * (1.0 - NoL), SHADOW_MIN_BIAS); + + // Receiver-plane depth gradient in shadow UV space + vec2 dz_duv = receiverPlaneDepthGradient(ndc, dndc_dx, dndc_dy); + + ivec2 dim = textureSize(shadowTex[ci], 0); // NOTE(review): ci can differ between invocations; indexing shadowTex with it presumably needs nonuniformEXT (GL_EXT_nonuniform_qualifier) - confirm + vec2 texelSize = 1.0 / vec2(dim); + + float baseRadius = SHADOW_PCF_BASE_RADIUS; + float radius = mix(baseRadius, baseRadius + SHADOW_PCF_CASCADE_GAIN, float(ci) / 3.0); + + float ang = hash12(suv * 4096.0) * 6.2831853; + vec2 r = vec2(cos(ang), sin(ang)); + mat2 rot = mat2(r.x, -r.y, r.y, r.x); + + const int TAP_COUNT = 16; + float visible = 0.0; + float wsum = 0.0; + + for (int i = 0; i < TAP_COUNT; ++i) + { + vec2 pu = rot * POISSON_16[i]; + vec2 off = pu * radius * texelSize; // uv-space offset of this tap + + float pr = length(pu); + float w = 1.0 - smoothstep(0.0, 0.65, pr); + + float mapD = textureLod(shadowTex[ci], suv + off, 0.0).r; // explicit LOD 0: implicit-LOD sampling is undefined in the divergent control flow this function is called from + + // Receiver-plane depth bias: conservative depth delta over this tap's offset + // Approximate |Δz| ≈ |dz/du|*|Δu| + |dz/dv|*|Δv| + float rpdb = dot(abs(dz_duv), abs(off)) * SHADOW_RPDB_SCALE; + + float vis = step(mapD, current + 
slopeBias + rpdb); + + visible += vis * w; + wsum += w; + } + + float visibility = (wsum > 0.0) ? (visible / wsum) : 1.0; + return visibility; +} +// Shadow visibility in [0,1] for a shaded point: pushes the receiver along its normal (further when the light is at a grazing angle), samples the primary cascade and, when computeCascadeMix reports a non-zero weight, cross-fades with the next cascade. +float calcShadowVisibility(vec3 worldPos, vec3 N, vec3 L) +{ + vec3 wp = worldPos + N * SHADOW_NORMAL_OFFSET * (0.5 + 0.5 * (1.0 - max(dot(N, L), 0.0))); + + CascadeMix cm = computeCascadeMix(wp); + float v0 = sampleCascadeShadow(cm.i0, wp, N, L); + if (cm.w1 <= 0.0) + return v0; + + float v1 = sampleCascadeShadow(cm.i1, wp, N, L); // NOTE(review): only some invocations reach this call and sampleCascadeShadow uses dFdx/dFdy, which GLSL leaves undefined in non-uniform control flow - confirm the blend band on target GPUs + return mix(v0, v1, clamp(cm.w1, 0.0, 1.0)); +} +// Schlick approximation of Fresnel reflectance: returns F0 at cosTheta = 1 and rises toward 1 as cosTheta approaches 0. +vec3 fresnelSchlick(float cosTheta, vec3 F0) +{ + return F0 + (1.0 - F0) * pow(clamp(1.0 - cosTheta, 0.0, 1.0), 5.0); // clamp: cosTheta can exceed 1 through rounding and pow() is undefined for a negative base +} +// GGX (Trowbridge-Reitz) normal distribution term, using alpha = roughness * roughness. +float DistributionGGX(vec3 N, vec3 H, float roughness) +{ + float a = roughness * roughness; + float a2 = a * a; + float NdotH = max(dot(N, H), 0.0); + float NdotH2 = NdotH * NdotH; + + float num = a2; + float denom = (NdotH2 * (a2 - 1.0) + 1.0); + denom = PI * denom * denom; + + return num / max(denom, 0.001); +} +// Schlick-GGX geometry term for a single direction, with k = (roughness + 1)^2 / 8. +float GeometrySchlickGGX(float NdotV, float roughness) +{ + float r = (roughness + 1.0); + float k = (r * r) / 8.0; + + float denom = NdotV * (1.0 - k) + k; + return NdotV / max(denom, 0.001); +} +// Smith geometry term: product of the Schlick-GGX terms for the view and light directions. +float GeometrySmith(vec3 N, vec3 V, vec3 L, float roughness) +{ + float ggx2 = GeometrySchlickGGX(max(dot(N, V), 0.0), roughness); + float ggx1 = GeometrySchlickGGX(max(dot(N, L), 0.0), roughness); + return ggx1 * ggx2; +} +// Deferred lighting resolve: reads the G-buffer at inUV; texels whose position sample has w == 0 are written as vec4(0.0) and skipped. +void main(){ + vec4 posSample = texture(posTex, inUV); + if (posSample.w == 0.0) + { + outColor = vec4(0.0); + return; + } + + vec3 pos = posSample.xyz; + vec4 normalSample = textureLod(normalTex, inUV, 0.0); // explicit LOD 0: implicit-LOD sampling after the divergent return above is undefined + vec3 N = normalize(normalSample.xyz); + float roughness = clamp(normalSample.w, 0.04, 1.0); + + vec4 albedoSample = textureLod(albedoTex, inUV, 0.0); // explicit LOD 0, same reason as normalTex + vec3 albedo = albedoSample.rgb; + float metallic = clamp(albedoSample.a, 0.0, 1.0); + + vec3 camPos = vec3(inverse(sceneData.view)[3]); + vec3 V = normalize(camPos - pos); + vec3 L = normalize(-sceneData.sunlightDirection.xyz); + vec3 H = normalize(V + L); + + vec3 
F0 = mix(vec3(0.04), albedo, metallic); + vec3 F = fresnelSchlick(max(dot(H, V), 0.0), F0); + float NDF = DistributionGGX(N, H, roughness); + float G = GeometrySmith(N, V, L, roughness); + + vec3 numerator = NDF * G * F; + float denom = 4.0 * max(dot(N, V), 0.0) * max(dot(N, L), 0.0); + vec3 specular = numerator / max(denom, 0.001); + + vec3 kS = F; + vec3 kD = (1.0 - kS) * (1.0 - metallic); + + float NdotL = max(dot(N, L), 0.0); + // Shadowing (directional, forward-Z shadow map). NOTE(review): sampleCascadeShadow counts a tap as lit when receiver depth plus bias is >= the stored depth, which reads like a reversed-Z comparison - confirm which depth convention the shadow pass uses + float visibility = calcShadowVisibility(pos, N, L); + + vec3 irradiance = sceneData.sunlightColor.rgb * sceneData.sunlightColor.a * NdotL * visibility; + + vec3 color = (kD * albedo / PI + specular) * irradiance; + color += albedo * sceneData.ambientColor.rgb; + + outColor = vec4(color, 1.0); +} diff --git a/src/compute/vk_compute.cpp b/src/compute/vk_compute.cpp index 15207c2..8e4d118 100644 --- a/src/compute/vk_compute.cpp +++ b/src/compute/vk_compute.cpp @@ -6,6 +6,7 @@ #include "vk_device.h" #include "core/vk_resource.h" +#include "frame_resources.h" ComputeBinding ComputeBinding::uniformBuffer(uint32_t binding, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) { @@ -354,9 +355,22 @@ void ComputeManager::dispatchInstance(VkCommandBuffer cmd, const std::string &in vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipeline()); - updateDescriptorSet(it->second.descriptorSet, it->second.bindings); - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &it->second.descriptorSet, - 0, nullptr); + // Allocate a transient per-frame descriptor set to avoid updating a set + // that might still be in use by a previous in-flight frame. + VkDescriptorSet transientSet = context->currentFrame + ? 
context->currentFrame->_frameDescriptors.allocate(context->getDevice()->device(), pipeline.descriptorLayout) + : VK_NULL_HANDLE; + if (transientSet == VK_NULL_HANDLE) + { + // Fallback to instance-owned set if per-frame allocator unavailable + updateDescriptorSet(it->second.descriptorSet, it->second.bindings); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &it->second.descriptorSet, 0, nullptr); + } + else + { + updateDescriptorSet(transientSet, it->second.bindings); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &transientSet, 0, nullptr); + } if (dispatchInfo.pushConstants && dispatchInfo.pushConstantSize > 0) { @@ -459,9 +473,22 @@ bool ComputeManager::createPipeline(const std::string &name, const ComputePipeli DescriptorLayoutBuilder layoutBuilder; for (size_t i = 0; i < createInfo.descriptorTypes.size(); ++i) { - layoutBuilder.add_binding(i, createInfo.descriptorTypes[i]); + layoutBuilder.add_binding(static_cast(i), createInfo.descriptorTypes[i]); } - computePipeline.descriptorLayout = layoutBuilder.build(context->getDevice()->device(), VK_SHADER_STAGE_COMPUTE_BIT); + + // Mark all compute bindings as UPDATE_AFTER_BIND so we can update + // persistent instance descriptor sets while a previous frame is in-flight. 
+ std::vector<VkDescriptorBindingFlags> bindingFlags(createInfo.descriptorTypes.size(), + VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT); + VkDescriptorSetLayoutBindingFlagsCreateInfo flagsCI{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO }; + flagsCI.bindingCount = static_cast<uint32_t>(bindingFlags.size()); + flagsCI.pBindingFlags = bindingFlags.data(); + + computePipeline.descriptorLayout = layoutBuilder.build( + context->getDevice()->device(), + VK_SHADER_STAGE_COMPUTE_BIT, + &flagsCI, + VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); } VkPipelineLayoutCreateInfo layoutInfo = vkinit::pipeline_layout_create_info(); diff --git a/src/core/config.h b/src/core/config.h index f62d2a1..0dc1342 100644 --- a/src/core/config.h +++ b/src/core/config.h @@ -7,6 +7,30 @@ inline constexpr bool kUseValidationLayers = false; inline constexpr bool kUseValidationLayers = true; #endif +// VMA diagnostics (stats prints + JSON dumps + allocation naming) +// - Default: disabled to avoid noise and I/O at shutdown. +// - Enable by launching with the environment variable `VE_VMA_DEBUG` set to 1, true, yes or on (case-insensitive; t and y are accepted as shorthands). +// - The variable is read once, on the first call, and the result is cached for the rest of the process. +#include <cctype> +#include <cstdlib> +#include <string> +inline constexpr bool kEnableVmaDebugByDefault = false; +inline bool vmaDebugEnabled() +{ + // Function-local static: getenv and the parsing run once instead of on every call + static const bool enabled = [] { + const char *env = std::getenv("VE_VMA_DEBUG"); + if (!env || !*env) + return kEnableVmaDebugByDefault; + std::string value(env); + for (char &c : value) + c = static_cast<char>(std::tolower(static_cast<unsigned char>(c))); + // Compare the whole value so strings such as "total" or "10" no longer enable diagnostics by accident + return value == "1" || value == "true" || value == "yes" || value == "on" || value == "t" || value == "y"; + }(); + return enabled; +} + // Shadow mapping configuration inline constexpr int kShadowCascadeCount = 4; // Maximum shadow distance for CSM in view-space units @@ -22,9 +46,9 @@ inline constexpr float kShadowCascadeRadiusMargin = 10.0f; inline constexpr float kShadowClipBaseRadius = 20.0f; // When using dynamic pullback, compute it from the covered XY range of each level. 
// pullback = max(kShadowClipPullbackMin, cover * kShadowClipPullbackFactor) -inline constexpr float kShadowClipPullbackFactor = 2.5f; // fraction of XY half-size behind center -inline constexpr float kShadowClipForwardFactor = 2.5f; // fraction of XY half-size in front of center for zFar -inline constexpr float kShadowClipPullbackMin = 160.0f; // lower bound on pullback so near levels don’t collapse +inline constexpr float kShadowClipPullbackFactor = 1.5f; // fraction of XY half-size behind center +inline constexpr float kShadowClipForwardFactor = 1.5f; // fraction of XY half-size in front of center for zFar +inline constexpr float kShadowClipPullbackMin = 40.0f; // lower bound on pullback so near levels don’t collapse // Additional Z padding for the orthographic frustum along light direction inline constexpr float kShadowClipZPadding = 40.0f; diff --git a/src/core/vk_descriptor_manager.cpp b/src/core/vk_descriptor_manager.cpp index 5ab9232..d71c8fb 100644 --- a/src/core/vk_descriptor_manager.cpp +++ b/src/core/vk_descriptor_manager.cpp @@ -9,7 +9,9 @@ void DescriptorManager::init(DeviceManager *deviceManager) { DescriptorLayoutBuilder builder; builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); - _singleImageDescriptorLayout = builder.build(_deviceManager->device(), VK_SHADER_STAGE_FRAGMENT_BIT); + _singleImageDescriptorLayout = builder.build( + _deviceManager->device(), VK_SHADER_STAGE_FRAGMENT_BIT, + nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); } { DescriptorLayoutBuilder builder; builder.add_binding(0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); @@ -19,7 +21,8 @@ void DescriptorManager::init(DeviceManager *deviceManager) builder.add_binding(1, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); } _gpuSceneDataDescriptorLayout = builder.build( - _deviceManager->device(), VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT); + _deviceManager->device(), VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, + nullptr, 
VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); } } diff --git a/src/core/vk_descriptors.cpp b/src/core/vk_descriptors.cpp index 4a621e0..f4892ba 100644 --- a/src/core/vk_descriptors.cpp +++ b/src/core/vk_descriptors.cpp @@ -77,10 +77,13 @@ void DescriptorWriter::write_image(int binding, VkImageView image, VkSampler sam void DescriptorWriter::write_acceleration_structure(int binding, VkAccelerationStructureKHR as) { + // Store the handle to ensure the pointer we give to Vulkan stays valid + VkAccelerationStructureKHR &storedAS = accelHandles.emplace_back(as); + VkWriteDescriptorSetAccelerationStructureKHR &acc = accelInfos.emplace_back( VkWriteDescriptorSetAccelerationStructureKHR{ VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR }); acc.accelerationStructureCount = 1; - acc.pAccelerationStructures = &as; + acc.pAccelerationStructures = &storedAS; VkWriteDescriptorSet write{ VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; write.dstBinding = binding; @@ -95,6 +98,8 @@ void DescriptorWriter::clear() imageInfos.clear(); writes.clear(); bufferInfos.clear(); + accelInfos.clear(); + accelHandles.clear(); } void DescriptorWriter::update_set(VkDevice device, VkDescriptorSet set) @@ -118,7 +123,10 @@ void DescriptorAllocator::init_pool(VkDevice device, uint32_t maxSets, std::span } VkDescriptorPoolCreateInfo pool_info = {.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO}; - pool_info.flags = 0; + // Enable update-after-bind so descriptors used by previous frame can be + // safely rewritten (e.g., compute instances). It is valid to allocate + // non-update-after-bind sets from such a pool. 
+ pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT; pool_info.maxSets = maxSets; pool_info.poolSizeCount = (uint32_t) poolSizes.size(); pool_info.pPoolSizes = poolSizes.data(); @@ -187,7 +195,8 @@ VkDescriptorPool DescriptorAllocatorGrowable::create_pool(VkDevice device, uint3 VkDescriptorPoolCreateInfo pool_info = {}; pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - pool_info.flags = 0; + // Use update-after-bind pools to support cross-frame rewrites. + pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT; pool_info.maxSets = setCount; pool_info.poolSizeCount = (uint32_t) poolSizes.size(); pool_info.pPoolSizes = poolSizes.data(); diff --git a/src/core/vk_descriptors.h b/src/core/vk_descriptors.h index 237eac5..42b71d1 100644 --- a/src/core/vk_descriptors.h +++ b/src/core/vk_descriptors.h @@ -20,6 +20,8 @@ struct DescriptorWriter std::deque imageInfos; std::deque bufferInfos; std::deque accelInfos; + // Keep AS handles alive so pAccelerationStructures points to valid memory + std::deque accelHandles; std::vector writes; void write_image(int binding, VkImageView image, VkSampler sampler, VkImageLayout layout, VkDescriptorType type); diff --git a/src/core/vk_device.cpp b/src/core/vk_device.cpp index 3ba888d..f39c124 100644 --- a/src/core/vk_device.cpp +++ b/src/core/vk_device.cpp @@ -30,8 +30,16 @@ void DeviceManager::init_vulkan(SDL_Window *window) features.synchronization2 = true; VkPhysicalDeviceVulkan12Features features12{.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES}; - features12.bufferDeviceAddress = true; - features12.descriptorIndexing = true; + features12.bufferDeviceAddress = VK_TRUE; + features12.descriptorIndexing = VK_TRUE; + // Enable update-after-bind related toggles for graphics/compute descriptors + features12.descriptorBindingPartiallyBound = VK_TRUE; + features12.descriptorBindingUpdateUnusedWhilePending = VK_TRUE; + features12.runtimeDescriptorArray = VK_TRUE; + 
features12.descriptorBindingUniformBufferUpdateAfterBind = VK_TRUE; + features12.descriptorBindingStorageBufferUpdateAfterBind = VK_TRUE; + features12.descriptorBindingSampledImageUpdateAfterBind = VK_TRUE; + features12.descriptorBindingStorageImageUpdateAfterBind = VK_TRUE; //use vkbootstrap to select a gpu. //We want a gpu that can write to the SDL surface and supports vulkan 1.3 @@ -72,14 +80,16 @@ void DeviceManager::init_vulkan(SDL_Window *window) //create the final vulkan device vkb::DeviceBuilder deviceBuilder{physicalDevice}; - // Enable ray query + accel struct features in device create pNext if supported + // Ray features are optional and enabled only if supported on the chosen GPU + VkPhysicalDeviceAccelerationStructureFeaturesKHR accelReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR }; + VkPhysicalDeviceRayQueryFeaturesKHR rayqReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR }; if (_rayQuerySupported && _accelStructSupported) { - VkPhysicalDeviceAccelerationStructureFeaturesKHR accelReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR }; accelReq.accelerationStructure = VK_TRUE; - VkPhysicalDeviceRayQueryFeaturesKHR rayqReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR }; - rayqReq.pNext = &accelReq; rayqReq.rayQuery = VK_TRUE; + rayqReq.pNext = &accelReq; + } + if (_rayQuerySupported && _accelStructSupported) { deviceBuilder.add_pNext(&rayqReq); } @@ -111,6 +121,18 @@ void DeviceManager::init_vulkan(SDL_Window *window) void DeviceManager::cleanup() { + // Optional VMA stats print + if (_allocator && vmaDebugEnabled()) + { + VmaTotalStatistics stats{}; + vmaCalculateStatistics(_allocator, &stats); + const VmaStatistics& s = stats.total.statistics; + fmt::print("[VMA] Blocks: {} | Allocations: {} | BlockBytes: {} | AllocationBytes: {}\n", + (size_t)s.blockCount, + (size_t)s.allocationCount, + (unsigned long long)s.blockBytes, + (unsigned long long)s.allocationBytes); + } 
vkDestroySurfaceKHR(_instance, _surface, nullptr); _deletionQueue.flush(); vkDestroyDevice(_device, nullptr); diff --git a/src/core/vk_engine.cpp b/src/core/vk_engine.cpp index 852027e..958b8e4 100644 --- a/src/core/vk_engine.cpp +++ b/src/core/vk_engine.cpp @@ -53,6 +53,57 @@ VulkanEngine *loadedEngine = nullptr; +// Prints the allocator's total block/allocation counts and byte sizes, labelled with `tag`. Does nothing unless vmaDebugEnabled() and both `dev` and its allocator are non-null. +static void print_vma_stats(DeviceManager* dev, const char* tag) +{ + if (!vmaDebugEnabled()) return; + if (!dev) return; + VmaAllocator alloc = dev->allocator(); + if (!alloc) return; + VmaTotalStatistics stats{}; + vmaCalculateStatistics(alloc, &stats); + const VmaStatistics &s = stats.total.statistics; + fmt::print("[VMA][{}] Blocks:{} Allocs:{} BlockBytes:{} AllocBytes:{}\n", + tag, + (size_t)s.blockCount, + (size_t)s.allocationCount, + (unsigned long long)s.blockBytes, + (unsigned long long)s.allocationBytes); +} + +// Writes the string produced by vmaBuildStatsString to "vma_" + tag + ".json" and reports whether the write succeeded. Does nothing unless vmaDebugEnabled() and both `dev` and its allocator are non-null. +static void dump_vma_json(DeviceManager* dev, const char* tag) +{ + if (!vmaDebugEnabled()) return; + if (!dev) return; + VmaAllocator alloc = dev->allocator(); + if (!alloc) return; + char* json = nullptr; + vmaBuildStatsString(alloc, &json, VK_TRUE); + if (json) + { + // Relative path: the file is created in the process's current working directory + std::string fname = std::string("vma_") + tag + ".json"; + FILE* f = fopen(fname.c_str(), "wb"); + if (f) + { + const size_t len = strlen(json); + const bool wrote = fwrite(json, 1, len, f) == len; + // fclose flushes buffered data, so a failure there also means the dump is incomplete + const bool closed = fclose(f) == 0; + if (wrote && closed) + fmt::print("[VMA] Wrote {}\n", fname); + else + fmt::print("[VMA] Failed to write {}\n", fname); + } + else + { + fmt::print("[VMA] Could not open {} for writing\n", fname); + } + vmaFreeStatsString(alloc, json); + } +} + void VulkanEngine::init() { // We initialize SDL and create a window with it. 
@@ -150,7 +190,7 @@ void VulkanEngine::init() auto imguiPass = std::make_unique(); _renderPassManager->setImGuiPass(std::move(imguiPass)); - const std::string structurePath = _assetManager->modelPath("police_office.glb"); + const std::string structurePath = _assetManager->modelPath("seoul_high.glb"); const auto structureFile = _assetManager->loadGLTF(structurePath); assert(structureFile.has_value()); @@ -233,7 +273,11 @@ void VulkanEngine::cleanup() { vkDeviceWaitIdle(_deviceManager->device()); + print_vma_stats(_deviceManager.get(), "begin"); + _sceneManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after SceneManager"); + dump_vma_json(_deviceManager.get(), "after_SceneManager"); if (_isInitialized) { @@ -253,24 +297,53 @@ void VulkanEngine::cleanup() metalRoughMaterial.clear_resources(_deviceManager->device()); _mainDeletionQueue.flush(); + print_vma_stats(_deviceManager.get(), "after MainDQ flush"); + dump_vma_json(_deviceManager.get(), "after_MainDQ"); _renderPassManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after RenderPassManager"); + dump_vma_json(_deviceManager.get(), "after_RenderPassManager"); _pipelineManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after PipelineManager"); + dump_vma_json(_deviceManager.get(), "after_PipelineManager"); compute.cleanup(); + print_vma_stats(_deviceManager.get(), "after Compute"); + dump_vma_json(_deviceManager.get(), "after_Compute"); _swapchainManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after Swapchain"); + dump_vma_json(_deviceManager.get(), "after_Swapchain"); if (_assetManager) _assetManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after AssetManager"); + dump_vma_json(_deviceManager.get(), "after_AssetManager"); + + // Ensure ray tracing resources (BLAS/TLAS/instance buffers) are freed before VMA is destroyed + if (_rayManager) { _rayManager->cleanup(); } + print_vma_stats(_deviceManager.get(), "after RTManager"); + 
dump_vma_json(_deviceManager.get(), "after_RTManager"); _resourceManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after ResourceManager"); + dump_vma_json(_deviceManager.get(), "after_ResourceManager"); _samplerManager->cleanup(); _descriptorManager->cleanup(); + print_vma_stats(_deviceManager.get(), "after Samplers+Descriptors"); + dump_vma_json(_deviceManager.get(), "after_Samplers_Descriptors"); _context->descriptors->destroy_pools(_deviceManager->device()); + // Extra safety: flush frame deletion queues once more before destroying VMA + for (int i = 0; i < FRAME_OVERLAP; i++) + { + _frames[i]._deletionQueue.flush(); + } + + print_vma_stats(_deviceManager.get(), "before DeviceManager"); + dump_vma_json(_deviceManager.get(), "before_DeviceManager"); _deviceManager->cleanup(); SDL_DestroyWindow(_window); @@ -280,11 +353,6 @@ void VulkanEngine::cleanup() void VulkanEngine::draw() { _sceneManager->update_scene(); - // Build or update TLAS for current frame if RT mode enabled (1 or 2) - if (_rayManager && _context->shadowSettings.mode != 0u) - { - _rayManager->buildTLASFromDrawContext(_context->getMainDrawContext()); - } //> frame_clear //wait until the gpu has finished rendering the last frame. Timeout of 1 second VK_CHECK(vkWaitForFences(_deviceManager->device(), 1, &get_current_frame()._renderFence, true, 1000000000)); @@ -319,6 +387,12 @@ void VulkanEngine::draw() //now that we are sure that the commands finished executing, we can safely reset the command buffer to begin recording again. 
VK_CHECK(vkResetCommandBuffer(get_current_frame()._mainCommandBuffer, 0)); + // Build or update TLAS for current frame now that the previous frame is idle + if (_rayManager && _context->shadowSettings.mode != 0u) + { + _rayManager->buildTLASFromDrawContext(_context->getMainDrawContext(), get_current_frame()._deletionQueue); + } + //naming it cmd for shorter writing VkCommandBuffer cmd = get_current_frame()._mainCommandBuffer; diff --git a/src/core/vk_raytracing.cpp b/src/core/vk_raytracing.cpp index 970e0cc..e84416c 100644 --- a/src/core/vk_raytracing.cpp +++ b/src/core/vk_raytracing.cpp @@ -21,6 +21,12 @@ void RayTracingManager::init(DeviceManager *dev, ResourceManager *res) vkGetDeviceProcAddr(_device->device(), "vkCmdBuildAccelerationStructuresKHR")); _vkGetAccelerationStructureDeviceAddressKHR = reinterpret_cast( vkGetDeviceProcAddr(_device->device(), "vkGetAccelerationStructureDeviceAddressKHR")); + + // Query AS properties for scratch alignment + VkPhysicalDeviceAccelerationStructurePropertiesKHR asProps{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR }; + VkPhysicalDeviceProperties2 props2{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, &asProps }; + vkGetPhysicalDeviceProperties2(_device->physicalDevice(), &props2); + _minScratchAlignment = std::max(asProps.minAccelerationStructureScratchOffsetAlignment, 256); } void RayTracingManager::cleanup() @@ -150,11 +156,15 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrdevice(), &asci, nullptr, &blas.handle)); - AllocatedBuffer scratch = _resources->create_buffer(sizes.buildScratchSize, + // Allocate scratch with padding to satisfy alignment requirements + const VkDeviceSize align = _minScratchAlignment; + const VkDeviceSize padded = sizes.buildScratchSize + (align - 1); + AllocatedBuffer scratch = _resources->create_buffer(padded, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VMA_MEMORY_USAGE_GPU_ONLY); - VkDeviceAddress 
scratchAddr = get_buffer_address(_device->device(), scratch.buffer); + VkDeviceAddress scratchBase = get_buffer_address(_device->device(), scratch.buffer); + VkDeviceAddress scratchAddr = (scratchBase + (align - 1)) & ~VkDeviceAddress(align - 1); buildInfo.dstAccelerationStructure = blas.handle; buildInfo.scratchData.deviceAddress = scratchAddr; @@ -178,18 +188,20 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptrdevice(), _tlas.handle, nullptr); - _tlas.handle = VK_NULL_HANDLE; - } - if (_tlas.storage.buffer) - { - _resources->destroy_buffer(_tlas.storage); - _tlas.storage = {}; + AccelStructureHandle old = _tlas; + dq.push_function([this, old]() { + if (old.handle) + _vkDestroyAccelerationStructureKHR(_device->device(), old.handle, nullptr); + if (old.storage.buffer) + _resources->destroy_buffer(old.storage); + }); + _tlas = {}; } _tlas.storage = _resources->create_buffer(requiredASSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | @@ -203,7 +215,7 @@ void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDevic VK_CHECK(_vkCreateAccelerationStructureKHR(_device->device(), &asci, nullptr, &_tlas.handle)); } -VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const DrawContext &dc) +VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const DrawContext &dc, DeletionQueue& dq) { // Collect instances; one per render object (opaque only). 
std::vector instances; @@ -239,8 +251,19 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra if (instances.empty()) { - // nothing to build - return _tlas.handle; + // No instances this frame: defer TLAS destruction to avoid racing with previous frames + if (_tlas.handle || _tlas.storage.buffer) + { + AccelStructureHandle old = _tlas; + dq.push_function([this, old]() { + if (old.handle) + _vkDestroyAccelerationStructureKHR(_device->device(), old.handle, nullptr); + if (old.storage.buffer) + _resources->destroy_buffer(old.storage); + }); + _tlas = {}; + } + return VK_NULL_HANDLE; } // Ensure instance buffer capacity @@ -293,15 +316,18 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra _vkGetAccelerationStructureBuildSizesKHR(_device->device(), VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &buildInfo, &primCount, &sizes); - ensure_tlas_storage(sizes.accelerationStructureSize, sizes.buildScratchSize); + ensure_tlas_storage(sizes.accelerationStructureSize, sizes.buildScratchSize, dq); buildInfo.dstAccelerationStructure = _tlas.handle; - AllocatedBuffer scratch = _resources->create_buffer(sizes.buildScratchSize, + const VkDeviceSize align2 = _minScratchAlignment; + const VkDeviceSize padded2 = sizes.buildScratchSize + (align2 - 1); + AllocatedBuffer scratch = _resources->create_buffer(padded2, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VMA_MEMORY_USAGE_GPU_ONLY); - VkDeviceAddress scratchAddr = get_buffer_address(_device->device(), scratch.buffer); - buildInfo.scratchData.deviceAddress = scratchAddr; + VkDeviceAddress scratchBase2 = get_buffer_address(_device->device(), scratch.buffer); + VkDeviceAddress scratchAddr2 = (scratchBase2 + (align2 - 1)) & ~VkDeviceAddress(align2 - 1); + buildInfo.scratchData.deviceAddress = scratchAddr2; VkAccelerationStructureBuildRangeInfoKHR range{}; range.primitiveCount = primCount; diff --git a/src/core/vk_raytracing.h 
b/src/core/vk_raytracing.h index 53aec49..74d2ead 100644 --- a/src/core/vk_raytracing.h +++ b/src/core/vk_raytracing.h @@ -25,8 +25,9 @@ public: // Build (or get) BLAS for a mesh. Safe to call multiple times. AccelStructureHandle getOrBuildBLAS(const std::shared_ptr& mesh); - // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable) - VkAccelerationStructureKHR buildTLASFromDrawContext(const DrawContext& dc); + // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable) + // Destruction of previous TLAS resources is deferred via the provided frame deletion queue + VkAccelerationStructureKHR buildTLASFromDrawContext(const DrawContext& dc, DeletionQueue& frameDQ); VkAccelerationStructureKHR tlas() const { return _tlas.handle; } VkDeviceAddress tlasAddress() const { return _tlas.deviceAddress; } @@ -34,7 +35,7 @@ public: // Safe to call even if no BLAS exists for the buffer. void removeBLASForBuffer(VkBuffer vertexBuffer); - private: +private: // function pointers (resolved on init) PFN_vkCreateAccelerationStructureKHR _vkCreateAccelerationStructureKHR{}; PFN_vkDestroyAccelerationStructureKHR _vkDestroyAccelerationStructureKHR{}; @@ -42,17 +43,20 @@ public: PFN_vkCmdBuildAccelerationStructuresKHR _vkCmdBuildAccelerationStructuresKHR{}; PFN_vkGetAccelerationStructureDeviceAddressKHR _vkGetAccelerationStructureDeviceAddressKHR{}; - DeviceManager* _device{nullptr}; - ResourceManager* _resources{nullptr}; + DeviceManager* _device{nullptr}; + ResourceManager* _resources{nullptr}; // BLAS cache by vertex buffer handle std::unordered_map _blasByVB; - // TLAS + scratch / instance buffer (rebuilt per frame) - AccelStructureHandle _tlas{}; - AllocatedBuffer _tlasInstanceBuffer{}; - size_t _tlasInstanceCapacity{0}; + // TLAS + scratch / instance buffer (rebuilt per frame) + AccelStructureHandle _tlas{}; + AllocatedBuffer _tlasInstanceBuffer{}; + size_t _tlasInstanceCapacity{0}; + + // Properties + VkDeviceSize 
_minScratchAlignment{256}; - void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch); - }; + void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch, DeletionQueue& frameDQ); +}; diff --git a/src/render/rg_graph.cpp b/src/render/rg_graph.cpp index 28ca485..fa89964 100644 --- a/src/render/rg_graph.cpp +++ b/src/render/rg_graph.cpp @@ -809,13 +809,16 @@ void RenderGraph::add_present_chain(RGImageHandle sourceDraw, RGImageHandle RenderGraph::import_draw_image() { - RGImportedImageDesc d{}; - d.name = "drawImage"; - d.image = _context->getSwapchain()->drawImage().image; - d.imageView = _context->getSwapchain()->drawImage().imageView; - d.format = _context->getSwapchain()->drawImage().imageFormat; - d.extent = _context->getDrawExtent(); - d.currentLayout = VK_IMAGE_LAYOUT_GENERAL; + RGImportedImageDesc d{}; + d.name = "drawImage"; + d.image = _context->getSwapchain()->drawImage().image; + d.imageView = _context->getSwapchain()->drawImage().imageView; + d.format = _context->getSwapchain()->drawImage().imageFormat; + d.extent = _context->getDrawExtent(); + // Treat layout as unknown at frame start to force an explicit barrier + // into the first declared usage (compute write / color attach). This + // avoids mismatches when the previous frame ended in a different layout. + d.currentLayout = VK_IMAGE_LAYOUT_UNDEFINED; return import_image(d); } @@ -942,8 +945,10 @@ RGImageHandle RenderGraph::import_swapchain_image(uint32_t index) d.imageView = views[index]; d.format = _context->getSwapchain()->swapchainImageFormat(); d.extent = _context->getSwapchain()->swapchainExtent(); - d.currentLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - return import_image(d); + // On first use after swapchain creation, images are in UNDEFINED layout. + // Start from UNDEFINED so the graph inserts the necessary transition. 
+ d.currentLayout = VK_IMAGE_LAYOUT_UNDEFINED; + return import_image(d); } void RenderGraph::resolve_timings() @@ -960,7 +965,7 @@ void RenderGraph::resolve_timings() _context->getDevice()->device(), _timestampPool, 0, queryCount, sizeof(uint64_t) * results.size(), results.data(), sizeof(uint64_t), - VK_QUERY_RESULT_64_BIT); + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); // Convert ticks to ms VkPhysicalDeviceProperties props{}; vkGetPhysicalDeviceProperties(_context->getDevice()->physicalDevice(), &props); @@ -983,6 +988,8 @@ void RenderGraph::resolve_timings() } } + // Ensure any pending work that might still reference the pool is complete + vkQueueWaitIdle(_context->getDevice()->graphicsQueue()); vkDestroyQueryPool(_context->getDevice()->device(), _timestampPool, nullptr); _timestampPool = VK_NULL_HANDLE; } diff --git a/src/render/rg_resources.cpp b/src/render/rg_resources.cpp index 22f6d27..d432642 100644 --- a/src/render/rg_resources.cpp +++ b/src/render/rg_resources.cpp @@ -1,8 +1,11 @@ #include #include #include +#include +#include #include "frame_resources.h" +#include "vk_device.h" void RGResourceRegistry::reset() { @@ -53,7 +56,13 @@ RGImageHandle RGResourceRegistry::add_transient(const RGImageDesc& d) rec.creationUsage = d.usage; VkExtent3D size{ d.extent.width, d.extent.height, 1 }; - rec.allocation = _ctx->getResources()->create_image(size, d.format, d.usage); + rec.allocation = _ctx->getResources()->create_image(size, d.format, d.usage); + // Name the allocation for diagnostics (optional) + if (vmaDebugEnabled() && _ctx && _ctx->getDevice()) + { + std::string nm = std::string("rg.image:") + d.name; + vmaSetAllocationName(_ctx->getDevice()->allocator(), rec.allocation.allocation, nm.c_str()); + } rec.image = rec.allocation.image; rec.imageView = rec.allocation.imageView; diff --git a/src/render/vk_materials.cpp b/src/render/vk_materials.cpp index ae75c0d..7113531 100644 --- a/src/render/vk_materials.cpp +++ b/src/render/vk_materials.cpp @@ -21,7 
+21,8 @@ void GLTFMetallic_Roughness::build_pipelines(VulkanEngine *engine) layoutBuilder.add_binding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); materialLayout = layoutBuilder.build(engine->_deviceManager->device(), - VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT); + VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, + nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); VkDescriptorSetLayout layouts[] = { engine->_descriptorManager->gpuSceneDataLayout(), diff --git a/src/render/vk_renderpass_background.cpp b/src/render/vk_renderpass_background.cpp index 2f245b0..5302b7b 100644 --- a/src/render/vk_renderpass_background.cpp +++ b/src/render/vk_renderpass_background.cpp @@ -94,6 +94,6 @@ void BackgroundPass::cleanup() _context->pipelines->destroyComputePipeline("gradient"); _context->pipelines->destroyComputePipeline("sky"); } - fmt::print("RenderPassManager::cleanup()\n"); + fmt::print("BackgroundPass::cleanup()\n"); _backgroundEffects.clear(); } diff --git a/src/render/vk_renderpass_lighting.cpp b/src/render/vk_renderpass_lighting.cpp index 2ce7b62..45f6569 100644 --- a/src/render/vk_renderpass_lighting.cpp +++ b/src/render/vk_renderpass_lighting.cpp @@ -30,7 +30,9 @@ void LightingPass::init(EngineContext *context) builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); builder.add_binding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); builder.add_binding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); - _gBufferInputDescriptorLayout = builder.build(_context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT); + _gBufferInputDescriptorLayout = builder.build( + _context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT, + nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); } // Allocate and write GBuffer descriptor set @@ -51,21 +53,22 @@ void LightingPass::init(EngineContext *context) { DescriptorLayoutBuilder builder; builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 
kShadowCascadeCount); - _shadowDescriptorLayout = builder.build(_context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT); + _shadowDescriptorLayout = builder.build( + _context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT, + nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT); } - // Build lighting pipeline through PipelineManager + // Build lighting pipelines (RT and non-RT) through PipelineManager VkDescriptorSetLayout layouts[] = { _context->getDescriptorLayouts()->gpuSceneDataLayout(), _gBufferInputDescriptorLayout, _shadowDescriptorLayout }; - GraphicsPipelineCreateInfo info{}; - info.vertexShaderPath = _context->getAssets()->shaderPath("fullscreen.vert.spv"); - info.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting.frag.spv"); - info.setLayouts.assign(std::begin(layouts), std::end(layouts)); - info.configure = [this](PipelineBuilder &b) { + GraphicsPipelineCreateInfo baseInfo{}; + baseInfo.vertexShaderPath = _context->getAssets()->shaderPath("fullscreen.vert.spv"); + baseInfo.setLayouts.assign(std::begin(layouts), std::end(layouts)); + baseInfo.configure = [this](PipelineBuilder &b) { b.set_input_topology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST); b.set_polygon_mode(VK_POLYGON_MODE_FILL); b.set_cull_mode(VK_CULL_MODE_NONE, VK_FRONT_FACE_CLOCKWISE); @@ -74,13 +77,16 @@ void LightingPass::init(EngineContext *context) b.disable_depthtest(); b.set_color_attachment_format(_context->getSwapchain()->drawImage().imageFormat); }; - _context->pipelines->createGraphicsPipeline("deferred_lighting", info); - // fetch the handles so current frame uses latest versions - MaterialPipeline mp{}; - _context->pipelines->getMaterialPipeline("deferred_lighting", mp); - _pipeline = mp.pipeline; - _pipelineLayout = mp.layout; + // Non-RT variant (no TLAS required) + auto infoNoRT = baseInfo; + infoNoRT.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting_nort.frag.spv"); + 
_context->pipelines->createGraphicsPipeline("deferred_lighting.nort", infoNoRT); + + // RT variant (requires GL_EXT_ray_query and TLAS bound at set=0,binding=1) + auto infoRT = baseInfo; + infoRT.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting.frag.spv"); + _context->pipelines->createGraphicsPipeline("deferred_lighting.rt", infoRT); _deletionQueue.push_function([&]() { // Pipelines are owned by PipelineManager; only destroy our local descriptor set layout @@ -145,8 +151,20 @@ void LightingPass::draw_lighting(VkCommandBuffer cmd, VkImageView drawView = resources.image_view(drawHandle); if (drawView == VK_NULL_HANDLE) return; - // Re-fetch pipeline in case it was hot-reloaded - pipelineManager->getGraphics("deferred_lighting", _pipeline, _pipelineLayout); + // Choose RT only if TLAS is valid; otherwise fall back to non-RT. + const bool haveRTFeatures = ctxLocal->getDevice()->supportsAccelerationStructure(); + const VkAccelerationStructureKHR tlas = (ctxLocal->ray ? ctxLocal->ray->tlas() : VK_NULL_HANDLE); + const VkDeviceAddress tlasAddr = (ctxLocal->ray ? ctxLocal->ray->tlasAddress() : 0); + const bool useRT = haveRTFeatures && (ctxLocal->shadowSettings.mode != 0u) && (tlas != VK_NULL_HANDLE) && (tlasAddr != 0); + + const char* pipeName = useRT ? "deferred_lighting.rt" : "deferred_lighting.nort"; + if (!pipelineManager->getGraphics(pipeName, _pipeline, _pipelineLayout)) + { + // Try the other variant as a fallback + const char* fallback = useRT ? "deferred_lighting.nort" : "deferred_lighting.rt"; + if (!pipelineManager->getGraphics(fallback, _pipeline, _pipelineLayout)) + return; // Neither pipeline is ready + } // Dynamic rendering is handled by the RenderGraph using the declared draw attachment. 
@@ -168,14 +186,10 @@ void LightingPass::draw_lighting(VkCommandBuffer cmd, deviceManager->device(), descriptorLayouts->gpuSceneDataLayout()); DescriptorWriter writer; writer.write_buffer(0, gpuSceneDataBuffer.buffer, sizeof(GPUSceneData), 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); - // If TLAS available and feature enabled, bind it at (set=0,binding=1) - if (ctxLocal->ray && ctxLocal->getDevice()->supportsAccelerationStructure() && ctxLocal->shadowSettings.mode != 0u) + // Only write TLAS when using the RT pipeline and we have a valid TLAS + if (useRT) { - VkAccelerationStructureKHR tlas = ctxLocal->ray->tlas(); - if (tlas != VK_NULL_HANDLE) - { - writer.write_acceleration_structure(1, tlas); - } + writer.write_acceleration_structure(1, tlas); } writer.update_set(deviceManager->device(), globalDescriptor); diff --git a/src/render/vk_renderpass_shadow.cpp b/src/render/vk_renderpass_shadow.cpp index 0173a28..c90f9a8 100644 --- a/src/render/vk_renderpass_shadow.cpp +++ b/src/render/vk_renderpass_shadow.cpp @@ -47,6 +47,7 @@ void ShadowPass::init(EngineContext *context) b.set_multisampling_none(); b.disable_blending(); + // Keep reverse-Z convention for shadow maps to match engine depth usage b.enable_depthtest(true, VK_COMPARE_OP_GREATER_OR_EQUAL); b.set_depth_format(VK_FORMAT_D32_SFLOAT); diff --git a/src/scene/vk_loader.cpp b/src/scene/vk_loader.cpp index 713f6e1..10d9bab 100644 --- a/src/scene/vk_loader.cpp +++ b/src/scene/vk_loader.cpp @@ -6,6 +6,7 @@ #include "render/vk_materials.h" #include "core/vk_initializers.h" #include "core/vk_types.h" +#include "core/config.h" #include #include @@ -42,6 +43,9 @@ std::optional load_image(VulkanEngine *engine, fastgltf::Asset & VkFormat fmt = srgb ? 
VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM; newImage = engine->_resourceManager->create_image( data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false); + // Name the allocation for diagnostics + if (vmaDebugEnabled()) + vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, path.c_str()); stbi_image_free(data); } @@ -59,6 +63,8 @@ std::optional load_image(VulkanEngine *engine, fastgltf::Asset & VkFormat fmt = srgb ? VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM; newImage = engine->_resourceManager->create_image( data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false); + if (vmaDebugEnabled()) + vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, "gltf.vector.image"); stbi_image_free(data); } @@ -86,8 +92,10 @@ std::optional load_image(VulkanEngine *engine, fastgltf::Asset & imagesize.depth = 1; VkFormat fmt = srgb ? VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM; - newImage = engine->_resourceManager->create_image( + newImage = engine->_resourceManager->create_image( data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false); + if (vmaDebugEnabled()) + vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, "gltf.bufferview.image"); stbi_image_free(data); } @@ -256,22 +264,33 @@ std::optional > loadGltf(VulkanEngine *engine, std:: //< load_arrays // load all textures - for (fastgltf::Image &image: gltf.images) + for (size_t i = 0; i < gltf.images.size(); ++i) { + fastgltf::Image &image = gltf.images[i]; // Default-load GLTF images as linear; baseColor is reloaded as sRGB when bound std::optional img = load_image(engine, gltf, image, false); if (img.has_value()) { images.push_back(*img); - file.images[image.name.c_str()] = *img; + // Use a unique, stable key so every allocation is tracked and later freed. + std::string key = image.name.empty() ? 
(std::string("gltf.image.") + std::to_string(i)) + : std::string(image.name.c_str()); + // Avoid accidental collisions from duplicate names + int suffix = 1; + while (file.images.find(key) != file.images.end()) + { + key = (image.name.empty() ? std::string("gltf.image.") + std::to_string(i) + : std::string(image.name.c_str())) + std::string("#") + std::to_string(suffix++); + } + file.images[key] = *img; } else { // we failed to load, so lets give the slot a default white texture to not // completely break loading images.push_back(engine->_errorCheckerboardImage); - std::cout << "gltf failed to load texture " << image.name << std::endl; + std::cout << "gltf failed to load texture index " << i << " (name='" << image.name << "')" << std::endl; } } diff --git a/src/scene/vk_scene.cpp b/src/scene/vk_scene.cpp index 730a32d..3265cad 100644 --- a/src/scene/vk_scene.cpp +++ b/src/scene/vk_scene.cpp @@ -198,6 +198,13 @@ std::shared_ptr SceneManager::getScene(const std::string &name) void SceneManager::cleanup() { + // Explicitly clear dynamic instances first to drop any extra shared_ptrs + // that could keep GPU resources alive. + clearMeshInstances(); + clearGLTFInstances(); + + // Drop our references to GLTF scenes. Their destructors call clearAll() + // exactly once to release GPU resources. loadedScenes.clear(); loadedNodes.clear(); }