FIX: Memory error fix, debug scheme

2025-11-01 01:21:41 +09:00
parent 235d9b2f83
commit d5ff6263ee
18 changed files with 609 additions and 95 deletions
--- a/shaders/deferred_lighting_nort.frag
+++ b/shaders/deferred_lighting_nort.frag
@@ -0,0 +1,273 @@
+#version 450
+#extension GL_GOOGLE_include_directive : require
+#include "input_structures.glsl"
+
+layout(location=0) in vec2 inUV;
+layout(location=0) out vec4 outColor;
+
+layout(set=1, binding=0) uniform sampler2D posTex;
+layout(set=1, binding=1) uniform sampler2D normalTex;
+layout(set=1, binding=2) uniform sampler2D albedoTex;
+layout(set=2, binding=0) uniform sampler2D shadowTex[4];
+
+// Tunables for shadow quality and blending
+// Border smoothing width in light-space NDC (0..1). Larger = wider cross-fade.
+const float SHADOW_BORDER_SMOOTH_NDC = 0.08;
+// Base PCF radius in texels for cascade 0; higher cascades scale this up slightly.
+const float SHADOW_PCF_BASE_RADIUS = 1.35;
+// Extra PCF radius (in texels) reached at the farthest cascade; cascades in between interpolate linearly from the base radius.
+const float SHADOW_PCF_CASCADE_GAIN = 2.0; // extra radius at far end
+// Receiver normal-based offset to reduce acne (in world units)
+const float SHADOW_NORMAL_OFFSET = 0.0025;
+// Scale for receiver-plane depth bias term (tweak if over/under biased)
+const float SHADOW_RPDB_SCALE = 1.0;
+// Minimum clamp to keep a tiny bias even on perpendicular receivers
+const float SHADOW_MIN_BIAS = 1e-5;
+
+const float PI = 3.14159265359;
+
+float hash12(vec2 p) // Cheap arithmetic hash: maps a 2D input to a single float using only fract/dot (no texture lookups)
+{
+    vec3 p3 = fract(vec3(p.xyx) * 0.1031);
+    p3 += dot(p3, p3.yzx + 33.33); return fract((p3.x + p3.y) * p3.z); // final fract() keeps only the fractional part
+}
+
+const vec2 POISSON_16[16] = vec2[16]( // 16 fixed 2D tap offsets used by the PCF loop; every entry has length below 0.5
+vec2(0.2852, -0.1883), vec2(-0.1464, 0.2591),
+vec2(-0.3651, -0.0974), vec2(0.0901, 0.3807),
+vec2(0.4740, 0.0679), vec2(-0.0512, -0.4466),
+vec2(-0.4497, 0.1673), vec2(0.3347, 0.3211),
+vec2(0.1948, -0.4196), vec2(-0.2919, -0.3291),
+vec2(-0.0763, 0.4661), vec2(0.4421, -0.2217),
+vec2(0.0281, -0.2468), vec2(-0.2104, 0.0573),
+vec2(0.1197, 0.0779), vec2(-0.0905, -0.1203)
+);
+
+// Cascade selection result: i0 = primary cascade, i1 = cascade cross-faded in with weight w1 (i1 == i0 and w1 == 0 when not blending)
+struct CascadeMix { uint i0; uint i1; float w1; };
+
+CascadeMix computeCascadeMix(vec3 worldPos) // first cascade whose light-space NDC box contains worldPos, plus an optional next cascade near its XY border
+{
+    uint primary = 3u; // remains the last cascade when no cascade contains the point
+    vec3 ndcP = vec3(0); // NDC of worldPos in the primary cascade (stays 0 if the loop finds none)
+    for (uint i = 0u; i < 4u; ++i)
+    {
+        vec4 lclip = sceneData.lightViewProjCascades[i] * vec4(worldPos, 1.0);
+        vec3 ndc = lclip.xyz / max(lclip.w, 1e-6); // w is floored to avoid dividing by ~0
+        if (abs(ndc.x) <= 1.0 && abs(ndc.y) <= 1.0 && ndc.z >= 0.0 && ndc.z <= 1.0)
+        {
+            primary = i;
+            ndcP = ndc;
+            break;
+        }
+    }
+
+    CascadeMix cm; cm.i0 = primary; cm.i1 = primary; cm.w1 = 0.0;
+
+    if (primary < 3u) // the last cascade has no following cascade to blend with
+    {
+        float edge = max(abs(ndcP.x), abs(ndcP.y)); // 0..1, 1 at border
+        // start blending once we are within SHADOW_BORDER_SMOOTH_NDC of the border; t ramps 0 -> 1 across that band
+        float t = clamp((edge - (1.0 - SHADOW_BORDER_SMOOTH_NDC)) / max(SHADOW_BORDER_SMOOTH_NDC, 1e-4), 0.0, 1.0);
+        float w = smoothstep(0.0, 1.0, t);
+
+        if (w > 0.0)
+        {
+            // Only blend if neighbor actually covers the point
+            uint neighbor = primary + 1u;
+            vec4 lclipN = sceneData.lightViewProjCascades[neighbor] * vec4(worldPos, 1.0);
+            vec3 ndcN = lclipN.xyz / max(lclipN.w, 1e-6);
+            bool insideN = (abs(ndcN.x) <= 1.0 && abs(ndcN.y) <= 1.0 && ndcN.z >= 0.0 && ndcN.z <= 1.0);
+            if (insideN)
+            {
+                cm.i1 = neighbor;
+                cm.w1 = w;
+            }
+        }
+    }
+
+    return cm;
+}
+
+// Compute receiver-plane depth gradient dz/duv using derivatives of shadow NDC
+// Reference: Akenine-Möller et al., "Receiver Plane Depth Bias" (PCF-friendly)
+vec2 receiverPlaneDepthGradient(vec3 ndc, vec3 dndc_dx, vec3 dndc_dy) // NOTE: the ndc argument is not read; only its screen-space derivatives are used
+{
+    // Convert XY to shadow map UV derivatives (ndc -> uv: u = 0.5*x + 0.5)
+    vec2 duv_dx = 0.5 * dndc_dx.xy;
+    vec2 duv_dy = 0.5 * dndc_dy.xy;
+
+    // Build Jacobian; columns are (du/dx, du/dy) and (dv/dx, dv/dy) since mat2() fills column by column
+    mat2 J = mat2(duv_dx.x, duv_dy.x,
+    duv_dx.y, duv_dy.y);
+
+    // Depth derivatives w.r.t screen pixels
+    vec2 dz_dxdy = vec2(dndc_dx.z, dndc_dy.z);
+
+    // Invert J to obtain dz/du and dz/dv. Guard against near-singular Jacobian.
+    float det = J[0][0] * J[1][1] - J[1][0] * J[0][1];
+    if (abs(det) < 1e-8)
+    {
+        // Degenerate mapping; return zero gradient so the caller's per-tap receiver-plane term becomes 0
+        return vec2(0.0);
+    }
+
+    // Manual 2x2 inverse (adjugate divided by determinant)
+    mat2 invJ = (1.0 / det) * mat2( J[1][1], -J[0][1],
+    -J[1][0],  J[0][0]);
+    return invJ * dz_dxdy; // (dz/du, dz/dv)
+}
+
+// PCF shadow visibility of worldPos in cascade ci: 1.0 = fully lit (also returned when outside the map's [0,1] UV range), 0.0 = every tap occluded. N and L only feed the slope bias.
+float sampleCascadeShadow(uint ci, vec3 worldPos, vec3 N, vec3 L)
+{
+    vec4 lclip = sceneData.lightViewProjCascades[ci] * vec4(worldPos, 1.0);
+    // Guard the divide exactly like computeCascadeMix so both agree on the projected position.
+    vec3 ndc  = lclip.xyz / max(lclip.w, 1e-6);
+    vec2 suv  = ndc.xy * 0.5 + 0.5;
+
+    // Take screen-space derivatives before the early-out below: derivatives are undefined in
+    // non-uniform control flow, i.e. once some fragments of a quad have already returned.
+    vec3 dndc_dx = dFdx(ndc);
+    vec3 dndc_dy = dFdy(ndc);
+
+    if (any(lessThan(suv, vec2(0.0))) || any(greaterThan(suv, vec2(1.0))))
+        return 1.0;
+
+    float current = clamp(ndc.z, 0.0, 1.0);
+
+    // Slope-based tiny baseline bias (cheap safety net)
+    float NoL       = max(dot(N, L), 0.0);
+    float slopeBias = max(0.0006 * (1.0 - NoL), SHADOW_MIN_BIAS);
+
+    // Receiver-plane depth gradient in shadow UV space
+    vec2 dz_duv = receiverPlaneDepthGradient(ndc, dndc_dx, dndc_dy);
+
+    vec2 texelSize = 1.0 / vec2(textureSize(shadowTex[ci], 0));
+
+    // Kernel radius in texels: base radius at cascade 0, base + gain at cascade 3.
+    float radius = mix(SHADOW_PCF_BASE_RADIUS, SHADOW_PCF_BASE_RADIUS + SHADOW_PCF_CASCADE_GAIN, float(ci) / 3.0);
+
+    // Rotate the tap pattern by an angle hashed from the shadow-map UV
+    float ang = hash12(suv * 4096.0) * 6.2831853;
+    vec2  r   = vec2(cos(ang), sin(ang));
+    mat2  rot = mat2(r.x, -r.y, r.y, r.x);
+
+    const int TAP_COUNT = 16;
+    float visible = 0.0;
+    float wsum    = 0.0;
+
+    for (int i = 0; i < TAP_COUNT; ++i)
+    {
+        vec2  pu   = rot * POISSON_16[i];
+        vec2  off  = pu * radius * texelSize; // uv-space offset of this tap
+        // Taps farther from the kernel centre get a smaller weight
+        float w    = 1.0 - smoothstep(0.0, 0.65, length(pu));
+
+        float mapD = texture(shadowTex[ci], suv + off).r;
+
+        // Receiver-plane depth bias: conservative depth delta over this tap's offset
+        // Approximate |Δz| ≈ |dz/du|*|Δu| + |dz/dv|*|Δv|
+        float rpdb = dot(abs(dz_duv), abs(off)) * SHADOW_RPDB_SCALE;
+
+        // Tap counts as lit when the stored depth does not exceed the biased receiver depth
+        visible += step(mapD, current + slopeBias + rpdb) * w;
+        wsum    += w;
+    }
+
+    return (wsum > 0.0) ? (visible / wsum) : 1.0;
+}
+
+float calcShadowVisibility(vec3 worldPos, vec3 N, vec3 L) // shadow visibility in [0,1] for worldPos, cross-fading two cascades near a cascade border
+{
+    vec3 wp = worldPos + N * SHADOW_NORMAL_OFFSET * (0.5 + 0.5 * (1.0 - max(dot(N, L), 0.0))); // normal offset: half strength when N faces L, full strength when dot(N, L) <= 0
+
+    CascadeMix cm = computeCascadeMix(wp);
+    float v0 = sampleCascadeShadow(cm.i0, wp, N, L);
+    if (cm.w1 <= 0.0) // no blend partner selected: the primary cascade alone decides
+    return v0;
+
+    float v1 = sampleCascadeShadow(cm.i1, wp, N, L);
+    return mix(v0, v1, clamp(cm.w1, 0.0, 1.0));
+}
+
+vec3 fresnelSchlick(float cosTheta, vec3 F0) // Schlick approximation: returns F0 at cosTheta = 1 and tends to 1 as cosTheta -> 0
+{   // The base is clamped to [0, 1]: pow(x, y) is undefined for x < 0, which rounding can produce when cosTheta slightly exceeds 1.
+    return F0 + (1.0 - F0) * pow(clamp(1.0 - cosTheta, 0.0, 1.0), 5.0);
+}
+
+float DistributionGGX(vec3 N, vec3 H, float roughness) // GGX normal-distribution term evaluated for half vector H, with alpha = roughness^2
+{
+    float a      = roughness * roughness;
+    float a2     = a * a;
+    float NdotH  = max(dot(N, H), 0.0);
+    float NdotH2 = NdotH * NdotH;
+
+    float num   = a2;
+    float denom = (NdotH2 * (a2 - 1.0) + 1.0);
+    denom = PI * denom * denom;
+
+    return num / max(denom, 0.001); // denominator is floored so the division cannot blow up
+}
+
+float GeometrySchlickGGX(float NdotV, float roughness) // Schlick-GGX geometry term for a single direction, given its cosine with the normal
+{
+    float r = (roughness + 1.0);
+    float k = (r * r) / 8.0; // k = (roughness + 1)^2 / 8
+
+    float denom = NdotV * (1.0 - k) + k;
+    return NdotV / max(denom, 0.001); // denominator is floored so the division cannot blow up
+}
+
+float GeometrySmith(vec3 N, vec3 V, vec3 L, float roughness) // product of the Schlick-GGX terms for the view and light directions
+{
+    float ggx2 = GeometrySchlickGGX(max(dot(N, V), 0.0), roughness); // view direction
+    float ggx1 = GeometrySchlickGGX(max(dot(N, L), 0.0), roughness); // light direction
+    return ggx1 * ggx2;
+}
+
+void main(){ // Deferred lighting: shades one G-buffer texel with the directional sun light (shadowed) plus an ambient term
+    vec4 posSample = texture(posTex, inUV);
+    if (posSample.w == 0.0) // texels whose position alpha is 0 are written as vec4(0) and skipped
+    {
+        outColor = vec4(0.0);
+        return;
+    }
+
+    vec3 pos = posSample.xyz;
+    vec4 normalSample = texture(normalTex, inUV);
+    vec3 N = normalize(normalSample.xyz);
+    float roughness = clamp(normalSample.w, 0.04, 1.0); // roughness is read from the normal texture's w channel
+
+    vec4 albedoSample = texture(albedoTex, inUV);
+    vec3 albedo = albedoSample.rgb;
+    float metallic = clamp(albedoSample.a, 0.0, 1.0); // metallic is read from the albedo texture's alpha channel
+
+    vec3 camPos = vec3(inverse(sceneData.view)[3]); // fourth column of the inverse view matrix
+    vec3 V = normalize(camPos - pos);
+    vec3 L = normalize(-sceneData.sunlightDirection.xyz);
+    vec3 H = normalize(V + L);
+
+    vec3 F0 = mix(vec3(0.04), albedo, metallic); // 0.04 at metallic = 0, albedo at metallic = 1
+    vec3 F  = fresnelSchlick(max(dot(H, V), 0.0), F0);
+    float NDF = DistributionGGX(N, H, roughness);
+    float G   = GeometrySmith(N, V, L, roughness);
+
+    vec3 numerator    = NDF * G * F;
+    float denom       = 4.0 * max(dot(N, V), 0.0) * max(dot(N, L), 0.0);
+    vec3 specular     = numerator / max(denom, 0.001); // denominator is floored so grazing angles cannot divide by zero
+
+    vec3 kS = F;
+    vec3 kD = (1.0 - kS) * (1.0 - metallic); // diffuse weight vanishes at metallic = 1
+
+    float NdotL = max(dot(N, L), 0.0);
+    // Shadowing (directional). NOTE(review): sampleCascadeShadow counts a tap as lit when the stored depth is <= the biased receiver depth — confirm this matches the shadow pass's depth convention.
+    float visibility = calcShadowVisibility(pos, N, L);
+
+    vec3 irradiance = sceneData.sunlightColor.rgb * sceneData.sunlightColor.a * NdotL * visibility; // sunlightColor.a scales the rgb color
+
+    vec3 color = (kD * albedo / PI + specular) * irradiance;
+    color += albedo * sceneData.ambientColor.rgb; // ambient term is added after, so it is not attenuated by shadow visibility
+
+    outColor = vec4(color, 1.0);
+}
--- a/src/compute/vk_compute.cpp
+++ b/src/compute/vk_compute.cpp
@@ -6,6 +6,7 @@

 #include "vk_device.h"
 #include "core/vk_resource.h"
+#include "frame_resources.h"

 ComputeBinding ComputeBinding::uniformBuffer(uint32_t binding, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset)
 {
@@ -354,9 +355,22 @@ void ComputeManager::dispatchInstance(VkCommandBuffer cmd, const std::string &in

    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipeline());

-    updateDescriptorSet(it->second.descriptorSet, it->second.bindings);
-    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &it->second.descriptorSet,
-                            0, nullptr);
+    // Allocate a transient per-frame descriptor set to avoid updating a set
+    // that might still be in use by a previous in-flight frame.
+    VkDescriptorSet transientSet = context->currentFrame
+        ? context->currentFrame->_frameDescriptors.allocate(context->getDevice()->device(), pipeline.descriptorLayout)
+        : VK_NULL_HANDLE;
+    if (transientSet == VK_NULL_HANDLE)
+    {
+        // Fallback to instance-owned set if per-frame allocator unavailable
+        updateDescriptorSet(it->second.descriptorSet, it->second.bindings);
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &it->second.descriptorSet, 0, nullptr);
+    }
+    else
+    {
+        updateDescriptorSet(transientSet, it->second.bindings);
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getLayout(), 0, 1, &transientSet, 0, nullptr);
+    }

    if (dispatchInfo.pushConstants && dispatchInfo.pushConstantSize > 0)
    {
@@ -459,9 +473,22 @@ bool ComputeManager::createPipeline(const std::string &name, const ComputePipeli
        DescriptorLayoutBuilder layoutBuilder;
        for (size_t i = 0; i < createInfo.descriptorTypes.size(); ++i)
        {
-            layoutBuilder.add_binding(i, createInfo.descriptorTypes[i]);
+            layoutBuilder.add_binding(static_cast<uint32_t>(i), createInfo.descriptorTypes[i]);
        }
-        computePipeline.descriptorLayout = layoutBuilder.build(context->getDevice()->device(), VK_SHADER_STAGE_COMPUTE_BIT);
+
+        // Mark all compute bindings as UPDATE_AFTER_BIND so we can update
+        // persistent instance descriptor sets while a previous frame is in-flight.
+        std::vector<VkDescriptorBindingFlags> bindingFlags(createInfo.descriptorTypes.size(),
+                                                          VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT);
+        VkDescriptorSetLayoutBindingFlagsCreateInfo flagsCI{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO };
+        flagsCI.bindingCount = static_cast<uint32_t>(bindingFlags.size());
+        flagsCI.pBindingFlags = bindingFlags.data();
+
+        computePipeline.descriptorLayout = layoutBuilder.build(
+            context->getDevice()->device(),
+            VK_SHADER_STAGE_COMPUTE_BIT,
+            &flagsCI,
+            VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);
    }

    VkPipelineLayoutCreateInfo layoutInfo = vkinit::pipeline_layout_create_info();
--- a/src/core/config.h
+++ b/src/core/config.h
@@ -7,6 +7,22 @@ inline constexpr bool kUseValidationLayers = false;
 inline constexpr bool kUseValidationLayers = true;
 #endif

+// VMA diagnostics (stats prints + JSON dumps + allocation naming)
+// - Default: disabled to avoid noise and I/O at shutdown.
+// - Enable at runtime by setting environment variable `VE_VMA_DEBUG=1`.
+#include <cstdlib>
+inline constexpr bool kEnableVmaDebugByDefault = false;
+inline bool vmaDebugEnabled() // True when VMA diagnostics are requested; reads VE_VMA_DEBUG from the environment on every call
+{
+    const char *env = std::getenv("VE_VMA_DEBUG");
+    if (env && *env)
+    {
+        // Only the first character is inspected: values starting with '1', 't'/'T' or 'y'/'Y' (e.g. 1/true/yes) enable; any other non-empty value disables
+        return (*env == '1') || (*env == 'T') || (*env == 't') || (*env == 'Y') || (*env == 'y');
+    }
+    return kEnableVmaDebugByDefault; // variable unset or empty: use the compile-time default
+}
+
 // Shadow mapping configuration
 inline constexpr int kShadowCascadeCount = 4;
 // Maximum shadow distance for CSM in view-space units
@@ -22,9 +38,9 @@ inline constexpr float kShadowCascadeRadiusMargin = 10.0f;
 inline constexpr float kShadowClipBaseRadius = 20.0f;
 // When using dynamic pullback, compute it from the covered XY range of each level.
 // pullback = max(kShadowClipPullbackMin, cover * kShadowClipPullbackFactor)
-inline constexpr float kShadowClipPullbackFactor = 2.5f;   // fraction of XY half-size behind center
-inline constexpr float kShadowClipForwardFactor  = 2.5f;   // fraction of XY half-size in front of center for zFar
-inline constexpr float kShadowClipPullbackMin    = 160.0f;   // lower bound on pullback so near levels don’t collapse
+inline constexpr float kShadowClipPullbackFactor = 1.5f;   // fraction of XY half-size behind center
+inline constexpr float kShadowClipForwardFactor  = 1.5f;   // fraction of XY half-size in front of center for zFar
+inline constexpr float kShadowClipPullbackMin    = 40.0f;   // lower bound on pullback so near levels don’t collapse
 // Additional Z padding for the orthographic frustum along light direction
 inline constexpr float kShadowClipZPadding = 40.0f;

--- a/src/core/vk_descriptor_manager.cpp
+++ b/src/core/vk_descriptor_manager.cpp
@@ -9,7 +9,9 @@ void DescriptorManager::init(DeviceManager *deviceManager)
    {
        DescriptorLayoutBuilder builder;
        builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        _singleImageDescriptorLayout = builder.build(_deviceManager->device(), VK_SHADER_STAGE_FRAGMENT_BIT);
+        _singleImageDescriptorLayout = builder.build(
+            _deviceManager->device(), VK_SHADER_STAGE_FRAGMENT_BIT,
+            nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);
    } {
        DescriptorLayoutBuilder builder;
        builder.add_binding(0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
@@ -19,7 +21,8 @@ void DescriptorManager::init(DeviceManager *deviceManager)
            builder.add_binding(1, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
        }
        _gpuSceneDataDescriptorLayout = builder.build(
-            _deviceManager->device(), VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT);
+            _deviceManager->device(), VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
+            nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);
    }
 }

--- a/src/core/vk_descriptors.cpp
+++ b/src/core/vk_descriptors.cpp
@@ -77,10 +77,13 @@ void DescriptorWriter::write_image(int binding, VkImageView image, VkSampler sam

 void DescriptorWriter::write_acceleration_structure(int binding, VkAccelerationStructureKHR as)
 {
+    // Store the handle to ensure the pointer we give to Vulkan stays valid
+    VkAccelerationStructureKHR &storedAS = accelHandles.emplace_back(as);
+
    VkWriteDescriptorSetAccelerationStructureKHR &acc = accelInfos.emplace_back(
        VkWriteDescriptorSetAccelerationStructureKHR{ VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR });
    acc.accelerationStructureCount = 1;
-    acc.pAccelerationStructures = &as;
+    acc.pAccelerationStructures = &storedAS;

    VkWriteDescriptorSet write{ VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
    write.dstBinding = binding;
@@ -95,6 +98,8 @@ void DescriptorWriter::clear()
    imageInfos.clear();
    writes.clear();
    bufferInfos.clear();
+    accelInfos.clear();
+    accelHandles.clear();
 }

 void DescriptorWriter::update_set(VkDevice device, VkDescriptorSet set)
@@ -118,7 +123,10 @@ void DescriptorAllocator::init_pool(VkDevice device, uint32_t maxSets, std::span
    }

    VkDescriptorPoolCreateInfo pool_info = {.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
-    pool_info.flags = 0;
+    // Enable update-after-bind so descriptors used by previous frame can be
+    // safely rewritten (e.g., compute instances). It is valid to allocate
+    // non-update-after-bind sets from such a pool.
+    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT;
    pool_info.maxSets = maxSets;
    pool_info.poolSizeCount = (uint32_t) poolSizes.size();
    pool_info.pPoolSizes = poolSizes.data();
@@ -187,7 +195,8 @@ VkDescriptorPool DescriptorAllocatorGrowable::create_pool(VkDevice device, uint3

    VkDescriptorPoolCreateInfo pool_info = {};
    pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
-    pool_info.flags = 0;
+    // Use update-after-bind pools to support cross-frame rewrites.
+    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT;
    pool_info.maxSets = setCount;
    pool_info.poolSizeCount = (uint32_t) poolSizes.size();
    pool_info.pPoolSizes = poolSizes.data();
--- a/src/core/vk_descriptors.h
+++ b/src/core/vk_descriptors.h
@@ -20,6 +20,8 @@ struct DescriptorWriter
    std::deque<VkDescriptorImageInfo> imageInfos;
    std::deque<VkDescriptorBufferInfo> bufferInfos;
    std::deque<VkWriteDescriptorSetAccelerationStructureKHR> accelInfos;
+    // Keep AS handles alive so pAccelerationStructures points to valid memory
+    std::deque<VkAccelerationStructureKHR> accelHandles;
    std::vector<VkWriteDescriptorSet> writes;

    void write_image(int binding, VkImageView image, VkSampler sampler, VkImageLayout layout, VkDescriptorType type);
--- a/src/core/vk_device.cpp
+++ b/src/core/vk_device.cpp
@@ -30,8 +30,16 @@ void DeviceManager::init_vulkan(SDL_Window *window)
    features.synchronization2 = true;

    VkPhysicalDeviceVulkan12Features features12{.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES};
-    features12.bufferDeviceAddress = true;
-    features12.descriptorIndexing = true;
+    features12.bufferDeviceAddress = VK_TRUE;
+    features12.descriptorIndexing = VK_TRUE;
+    // Enable update-after-bind related toggles for graphics/compute descriptors
+    features12.descriptorBindingPartiallyBound = VK_TRUE;
+    features12.descriptorBindingUpdateUnusedWhilePending = VK_TRUE;
+    features12.runtimeDescriptorArray = VK_TRUE;
+    features12.descriptorBindingUniformBufferUpdateAfterBind = VK_TRUE;
+    features12.descriptorBindingStorageBufferUpdateAfterBind = VK_TRUE;
+    features12.descriptorBindingSampledImageUpdateAfterBind = VK_TRUE;
+    features12.descriptorBindingStorageImageUpdateAfterBind = VK_TRUE;

    //use vkbootstrap to select a gpu.
    //We want a gpu that can write to the SDL surface and supports vulkan 1.3
@@ -72,14 +80,16 @@ void DeviceManager::init_vulkan(SDL_Window *window)
    //create the final vulkan device
    vkb::DeviceBuilder deviceBuilder{physicalDevice};

-    // Enable ray query + accel struct features in device create pNext if supported
+    // Ray features are optional and enabled only if supported on the chosen GPU
+    VkPhysicalDeviceAccelerationStructureFeaturesKHR accelReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR };
+    VkPhysicalDeviceRayQueryFeaturesKHR rayqReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR };
    if (_rayQuerySupported && _accelStructSupported)
    {
-        VkPhysicalDeviceAccelerationStructureFeaturesKHR accelReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR };
        accelReq.accelerationStructure = VK_TRUE;
-        VkPhysicalDeviceRayQueryFeaturesKHR rayqReq{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR };
-        rayqReq.pNext = &accelReq;
        rayqReq.rayQuery = VK_TRUE;
+        rayqReq.pNext = &accelReq;
+    }
+    if (_rayQuerySupported && _accelStructSupported) {
        deviceBuilder.add_pNext(&rayqReq);
    }

@@ -111,6 +121,18 @@ void DeviceManager::init_vulkan(SDL_Window *window)

 void DeviceManager::cleanup()
 {
+    // Optional VMA stats print
+    if (_allocator && vmaDebugEnabled())
+    {
+        VmaTotalStatistics stats{};
+        vmaCalculateStatistics(_allocator, &stats);
+        const VmaStatistics& s = stats.total.statistics;
+        fmt::print("[VMA] Blocks: {} | Allocations: {} | BlockBytes: {} | AllocationBytes: {}\n",
+                   (size_t)s.blockCount,
+                   (size_t)s.allocationCount,
+                   (unsigned long long)s.blockBytes,
+                   (unsigned long long)s.allocationBytes);
+    }
    vkDestroySurfaceKHR(_instance, _surface, nullptr);
    _deletionQueue.flush();
    vkDestroyDevice(_device, nullptr);
--- a/src/core/vk_engine.cpp
+++ b/src/core/vk_engine.cpp
@@ -53,6 +53,46 @@

 VulkanEngine *loadedEngine = nullptr;

+static void print_vma_stats(DeviceManager* dev, const char* tag) // Prints the allocator's total block/allocation counts and byte sizes, labelled with tag
+{
+    if (!vmaDebugEnabled()) return; // no output unless VMA diagnostics are enabled
+    if (!dev) return;
+    VmaAllocator alloc = dev->allocator();
+    if (!alloc) return;
+    VmaTotalStatistics stats{};
+    vmaCalculateStatistics(alloc, &stats);
+    const VmaStatistics &s = stats.total.statistics; // only the aggregated totals are printed
+    fmt::print("[VMA][{}] Blocks:{} Allocs:{} BlockBytes:{} AllocBytes:{}\n",
+               tag,
+               (size_t)s.blockCount,
+               (size_t)s.allocationCount,
+               (unsigned long long)s.blockBytes,
+               (unsigned long long)s.allocationBytes);
+}
+
+static void dump_vma_json(DeviceManager* dev, const char* tag) // Writes the allocator's stats string to "vma_<tag>.json"; does nothing unless vmaDebugEnabled()
+{
+    if (!vmaDebugEnabled()) return;
+    if (!dev) return;
+    VmaAllocator alloc = dev->allocator();
+    if (!alloc) return;
+    char* json = nullptr;
+    vmaBuildStatsString(alloc, &json, VK_TRUE); // the returned string is released with vmaFreeStatsString below
+    if (json)
+    {
+        // Relative file name, so the dump is created in the process's current working directory
+        std::string fname = std::string("vma_") + tag + ".json";
+        FILE* f = fopen(fname.c_str(), "wb");
+        if (f)
+        {
+            fwrite(json, 1, strlen(json), f); // the write result is not checked
+            fclose(f);
+            fmt::print("[VMA] Wrote {}\n", fname);
+        }
+        vmaFreeStatsString(alloc, json);
+    }
+}
+
 void VulkanEngine::init()
 {
    // We initialize SDL and create a window with it.
@@ -150,7 +190,7 @@ void VulkanEngine::init()
    auto imguiPass = std::make_unique<ImGuiPass>();
    _renderPassManager->setImGuiPass(std::move(imguiPass));

-    const std::string structurePath = _assetManager->modelPath("police_office.glb");
+    const std::string structurePath = _assetManager->modelPath("seoul_high.glb");
    const auto structureFile = _assetManager->loadGLTF(structurePath);

    assert(structureFile.has_value());
@@ -233,7 +273,11 @@ void VulkanEngine::cleanup()
 {
    vkDeviceWaitIdle(_deviceManager->device());

+    print_vma_stats(_deviceManager.get(), "begin");
+
    _sceneManager->cleanup();
+    print_vma_stats(_deviceManager.get(), "after SceneManager");
+    dump_vma_json(_deviceManager.get(), "after_SceneManager");

    if (_isInitialized)
    {
@@ -253,24 +297,53 @@ void VulkanEngine::cleanup()
        metalRoughMaterial.clear_resources(_deviceManager->device());

        _mainDeletionQueue.flush();
+        print_vma_stats(_deviceManager.get(), "after MainDQ flush");
+        dump_vma_json(_deviceManager.get(), "after_MainDQ");

    _renderPassManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after RenderPassManager");
+        dump_vma_json(_deviceManager.get(), "after_RenderPassManager");

        _pipelineManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after PipelineManager");
+        dump_vma_json(_deviceManager.get(), "after_PipelineManager");

        compute.cleanup();
+        print_vma_stats(_deviceManager.get(), "after Compute");
+        dump_vma_json(_deviceManager.get(), "after_Compute");

        _swapchainManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after Swapchain");
+        dump_vma_json(_deviceManager.get(), "after_Swapchain");

        if (_assetManager) _assetManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after AssetManager");
+        dump_vma_json(_deviceManager.get(), "after_AssetManager");
+
+        // Ensure ray tracing resources (BLAS/TLAS/instance buffers) are freed before VMA is destroyed
+        if (_rayManager) { _rayManager->cleanup(); }
+        print_vma_stats(_deviceManager.get(), "after RTManager");
+        dump_vma_json(_deviceManager.get(), "after_RTManager");

        _resourceManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after ResourceManager");
+        dump_vma_json(_deviceManager.get(), "after_ResourceManager");

        _samplerManager->cleanup();
        _descriptorManager->cleanup();
+        print_vma_stats(_deviceManager.get(), "after Samplers+Descriptors");
+        dump_vma_json(_deviceManager.get(), "after_Samplers_Descriptors");

        _context->descriptors->destroy_pools(_deviceManager->device());

+        // Extra safety: flush frame deletion queues once more before destroying VMA
+        for (int i = 0; i < FRAME_OVERLAP; i++)
+        {
+            _frames[i]._deletionQueue.flush();
+        }
+
+        print_vma_stats(_deviceManager.get(), "before DeviceManager");
+        dump_vma_json(_deviceManager.get(), "before_DeviceManager");
        _deviceManager->cleanup();

        SDL_DestroyWindow(_window);
@@ -280,11 +353,6 @@ void VulkanEngine::cleanup()
 void VulkanEngine::draw()
 {
    _sceneManager->update_scene();
-    // Build or update TLAS for current frame if RT mode enabled (1 or 2)
-    if (_rayManager && _context->shadowSettings.mode != 0u)
-    {
-        _rayManager->buildTLASFromDrawContext(_context->getMainDrawContext());
-    }
    //> frame_clear
    //wait until the gpu has finished rendering the last frame. Timeout of 1 second
    VK_CHECK(vkWaitForFences(_deviceManager->device(), 1, &get_current_frame()._renderFence, true, 1000000000));
@@ -319,6 +387,12 @@ void VulkanEngine::draw()
    //now that we are sure that the commands finished executing, we can safely reset the command buffer to begin recording again.
    VK_CHECK(vkResetCommandBuffer(get_current_frame()._mainCommandBuffer, 0));

+    // Build or update TLAS for current frame now that the previous frame is idle
+    if (_rayManager && _context->shadowSettings.mode != 0u)
+    {
+        _rayManager->buildTLASFromDrawContext(_context->getMainDrawContext(), get_current_frame()._deletionQueue);
+    }
+
    //naming it cmd for shorter writing
    VkCommandBuffer cmd = get_current_frame()._mainCommandBuffer;

--- a/src/core/vk_raytracing.cpp
+++ b/src/core/vk_raytracing.cpp
@@ -21,6 +21,12 @@ void RayTracingManager::init(DeviceManager *dev, ResourceManager *res)
        vkGetDeviceProcAddr(_device->device(), "vkCmdBuildAccelerationStructuresKHR"));
    _vkGetAccelerationStructureDeviceAddressKHR = reinterpret_cast<PFN_vkGetAccelerationStructureDeviceAddressKHR>(
        vkGetDeviceProcAddr(_device->device(), "vkGetAccelerationStructureDeviceAddressKHR"));
+
+    // Query AS properties for scratch alignment
+    VkPhysicalDeviceAccelerationStructurePropertiesKHR asProps{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR };
+    VkPhysicalDeviceProperties2 props2{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, &asProps };
+    vkGetPhysicalDeviceProperties2(_device->physicalDevice(), &props2);
+    _minScratchAlignment = std::max<VkDeviceSize>(asProps.minAccelerationStructureScratchOffsetAlignment, 256);
 }

 void RayTracingManager::cleanup()
@@ -150,11 +156,15 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
    asci.size = sizes.accelerationStructureSize;
    VK_CHECK(_vkCreateAccelerationStructureKHR(_device->device(), &asci, nullptr, &blas.handle));

-    AllocatedBuffer scratch = _resources->create_buffer(sizes.buildScratchSize,
+    // Allocate scratch with padding to satisfy alignment requirements
+    const VkDeviceSize align = _minScratchAlignment;
+    const VkDeviceSize padded = sizes.buildScratchSize + (align - 1);
+    AllocatedBuffer scratch = _resources->create_buffer(padded,
                                                        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                                        VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                                        VMA_MEMORY_USAGE_GPU_ONLY);
-    VkDeviceAddress scratchAddr = get_buffer_address(_device->device(), scratch.buffer);
+    VkDeviceAddress scratchBase = get_buffer_address(_device->device(), scratch.buffer);
+    VkDeviceAddress scratchAddr = (scratchBase + (align - 1)) & ~VkDeviceAddress(align - 1);

    buildInfo.dstAccelerationStructure = blas.handle;
    buildInfo.scratchData.deviceAddress = scratchAddr;
@@ -178,18 +188,20 @@ AccelStructureHandle RayTracingManager::getOrBuildBLAS(const std::shared_ptr<Mes
    return blas;
 }

-void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize /*requiredScratch*/)
+void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize /*requiredScratch*/, DeletionQueue& dq)
 {
-    // Simple: recreate TLAS storage if size grows
-    if (_tlas.handle)
+    // Recreate TLAS storage if size grows. Defer destruction to the frame DQ to
+    // avoid freeing while referenced by in-flight frames.
+    if (_tlas.handle || _tlas.storage.buffer)
    {
-        _vkDestroyAccelerationStructureKHR(_device->device(), _tlas.handle, nullptr);
-        _tlas.handle = VK_NULL_HANDLE;
-    }
-    if (_tlas.storage.buffer)
-    {
-        _resources->destroy_buffer(_tlas.storage);
-        _tlas.storage = {};
+        AccelStructureHandle old = _tlas;
+        dq.push_function([this, old]() {
+            if (old.handle)
+                _vkDestroyAccelerationStructureKHR(_device->device(), old.handle, nullptr);
+            if (old.storage.buffer)
+                _resources->destroy_buffer(old.storage);
+        });
+        _tlas = {};
    }
    _tlas.storage = _resources->create_buffer(requiredASSize,
                                              VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR |
@@ -203,7 +215,7 @@ void RayTracingManager::ensure_tlas_storage(VkDeviceSize requiredASSize, VkDevic
    VK_CHECK(_vkCreateAccelerationStructureKHR(_device->device(), &asci, nullptr, &_tlas.handle));
 }

-VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const DrawContext &dc)
+VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const DrawContext &dc, DeletionQueue& dq)
 {
    // Collect instances; one per render object (opaque only).
    std::vector<VkAccelerationStructureInstanceKHR> instances;
@@ -239,8 +251,19 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra

    if (instances.empty())
    {
-        // nothing to build
-        return _tlas.handle;
+        // No instances this frame: defer TLAS destruction to avoid racing with previous frames
+        if (_tlas.handle || _tlas.storage.buffer)
+        {
+            AccelStructureHandle old = _tlas;
+            dq.push_function([this, old]() {
+                if (old.handle)
+                    _vkDestroyAccelerationStructureKHR(_device->device(), old.handle, nullptr);
+                if (old.storage.buffer)
+                    _resources->destroy_buffer(old.storage);
+            });
+            _tlas = {};
+        }
+        return VK_NULL_HANDLE;
    }

    // Ensure instance buffer capacity
@@ -293,15 +316,18 @@ VkAccelerationStructureKHR RayTracingManager::buildTLASFromDrawContext(const Dra
    _vkGetAccelerationStructureBuildSizesKHR(_device->device(), VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR,
                                             &buildInfo, &primCount, &sizes);

-    ensure_tlas_storage(sizes.accelerationStructureSize, sizes.buildScratchSize);
+    ensure_tlas_storage(sizes.accelerationStructureSize, sizes.buildScratchSize, dq);

    buildInfo.dstAccelerationStructure = _tlas.handle;
-    AllocatedBuffer scratch = _resources->create_buffer(sizes.buildScratchSize,
+    const VkDeviceSize align2 = _minScratchAlignment;
+    const VkDeviceSize padded2 = sizes.buildScratchSize + (align2 - 1);
+    AllocatedBuffer scratch = _resources->create_buffer(padded2,
                                                        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                                        VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                                        VMA_MEMORY_USAGE_GPU_ONLY);
-    VkDeviceAddress scratchAddr = get_buffer_address(_device->device(), scratch.buffer);
-    buildInfo.scratchData.deviceAddress = scratchAddr;
+    VkDeviceAddress scratchBase2 = get_buffer_address(_device->device(), scratch.buffer);
+    VkDeviceAddress scratchAddr2 = (scratchBase2 + (align2 - 1)) & ~VkDeviceAddress(align2 - 1);
+    buildInfo.scratchData.deviceAddress = scratchAddr2;

    VkAccelerationStructureBuildRangeInfoKHR range{};
    range.primitiveCount = primCount;
--- a/src/core/vk_raytracing.h
+++ b/src/core/vk_raytracing.h
@@ -25,8 +25,9 @@ public:
     // Build (or get) BLAS for a mesh. Safe to call multiple times.
     AccelStructureHandle getOrBuildBLAS(const std::shared_ptr<MeshAsset>& mesh);
 
-     // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable)
-    VkAccelerationStructureKHR buildTLASFromDrawContext(const DrawContext& dc);
+    // Rebuild TLAS from current draw context; returns TLAS handle (or null if unavailable)
+    // Destruction of previous TLAS resources is deferred via the provided frame deletion queue
+    VkAccelerationStructureKHR buildTLASFromDrawContext(const DrawContext& dc, DeletionQueue& frameDQ);
    VkAccelerationStructureKHR tlas() const { return _tlas.handle; }
    VkDeviceAddress tlasAddress() const { return _tlas.deviceAddress; }

@@ -34,7 +35,7 @@ public:
    // Safe to call even if no BLAS exists for the buffer.
    void removeBLASForBuffer(VkBuffer vertexBuffer);
 
- private:
+private:
     // function pointers (resolved on init)
     PFN_vkCreateAccelerationStructureKHR            _vkCreateAccelerationStructureKHR{};
     PFN_vkDestroyAccelerationStructureKHR           _vkDestroyAccelerationStructureKHR{};
@@ -42,17 +43,20 @@ public:
     PFN_vkCmdBuildAccelerationStructuresKHR         _vkCmdBuildAccelerationStructuresKHR{};
     PFN_vkGetAccelerationStructureDeviceAddressKHR  _vkGetAccelerationStructureDeviceAddressKHR{};
 
-     DeviceManager* _device{nullptr};
-     ResourceManager* _resources{nullptr};
+    DeviceManager* _device{nullptr};
+    ResourceManager* _resources{nullptr};
 
     // BLAS cache by vertex buffer handle
     std::unordered_map<VkBuffer, AccelStructureHandle> _blasByVB;
 
-     // TLAS + scratch / instance buffer (rebuilt per frame)
-     AccelStructureHandle _tlas{};
-     AllocatedBuffer _tlasInstanceBuffer{};
-     size_t _tlasInstanceCapacity{0};
+    // TLAS + scratch / instance buffer (rebuilt per frame)
+    AccelStructureHandle _tlas{};
+    AllocatedBuffer _tlasInstanceBuffer{};
+    size_t _tlasInstanceCapacity{0};
+
+    // Scratch address alignment for acceleration-structure builds (power of two; default 256)
+    VkDeviceSize _minScratchAlignment{256};
 
-     void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch);
- };
+    void ensure_tlas_storage(VkDeviceSize requiredASSize, VkDeviceSize requiredScratch, DeletionQueue& frameDQ);
+};
 
--- a/src/render/rg_graph.cpp
+++ b/src/render/rg_graph.cpp
@@ -809,13 +809,16 @@ void RenderGraph::add_present_chain(RGImageHandle sourceDraw,

 RGImageHandle RenderGraph::import_draw_image()
 {
-	RGImportedImageDesc d{};
-	d.name = "drawImage";
-	d.image = _context->getSwapchain()->drawImage().image;
-	d.imageView = _context->getSwapchain()->drawImage().imageView;
-	d.format = _context->getSwapchain()->drawImage().imageFormat;
-	d.extent = _context->getDrawExtent();
-	d.currentLayout = VK_IMAGE_LAYOUT_GENERAL;
+    RGImportedImageDesc d{};
+    d.name = "drawImage";
+    d.image = _context->getSwapchain()->drawImage().image;
+    d.imageView = _context->getSwapchain()->drawImage().imageView;
+    d.format = _context->getSwapchain()->drawImage().imageFormat;
+    d.extent = _context->getDrawExtent();
+    // Import with an unknown (UNDEFINED) layout so the graph always emits an
+    // explicit barrier into the first declared usage (compute write / color
+    // attachment), even if the previous frame left the image in another layout.
+    d.currentLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    return import_image(d);
 }

@@ -942,8 +945,10 @@ RGImageHandle RenderGraph::import_swapchain_image(uint32_t index)
 	d.imageView = views[index];
 	d.format = _context->getSwapchain()->swapchainImageFormat();
 	d.extent = _context->getSwapchain()->swapchainExtent();
-	d.currentLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
-	return import_image(d);
+    // On first use after swapchain creation, images are in UNDEFINED layout.
+    // Start from UNDEFINED so the graph inserts the necessary transition.
+    d.currentLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    return import_image(d);
 }

 void RenderGraph::resolve_timings()
@@ -960,7 +965,7 @@ void RenderGraph::resolve_timings()
        _context->getDevice()->device(), _timestampPool,
        0, queryCount,
        sizeof(uint64_t) * results.size(), results.data(), sizeof(uint64_t),
-        VK_QUERY_RESULT_64_BIT);
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
    // Convert ticks to ms
    VkPhysicalDeviceProperties props{};
    vkGetPhysicalDeviceProperties(_context->getDevice()->physicalDevice(), &props);
@@ -983,6 +988,8 @@ void RenderGraph::resolve_timings()
        }
    }

+    // Ensure any pending work that might still reference the pool is complete
+    vkQueueWaitIdle(_context->getDevice()->graphicsQueue());
    vkDestroyQueryPool(_context->getDevice()->device(), _timestampPool, nullptr);
    _timestampPool = VK_NULL_HANDLE;
 }
--- a/src/render/rg_resources.cpp
+++ b/src/render/rg_resources.cpp
@@ -1,8 +1,11 @@
 #include <render/rg_resources.h>
 #include <core/engine_context.h>
 #include <core/vk_resource.h>
+#include <vk_mem_alloc.h>
+#include <core/config.h>

 #include "frame_resources.h"
+#include "vk_device.h"

 void RGResourceRegistry::reset()
 {
@@ -53,7 +56,13 @@ RGImageHandle RGResourceRegistry::add_transient(const RGImageDesc& d)
    rec.creationUsage = d.usage;

 	VkExtent3D size{ d.extent.width, d.extent.height, 1 };
-	rec.allocation = _ctx->getResources()->create_image(size, d.format, d.usage);
+    rec.allocation = _ctx->getResources()->create_image(size, d.format, d.usage);
+    // Name the allocation for diagnostics (optional)
+    if (vmaDebugEnabled() && _ctx && _ctx->getDevice())
+    {
+        std::string nm = std::string("rg.image:") + d.name;
+        vmaSetAllocationName(_ctx->getDevice()->allocator(), rec.allocation.allocation, nm.c_str());
+    }
 	rec.image = rec.allocation.image;
 	rec.imageView = rec.allocation.imageView;

--- a/src/render/vk_materials.cpp
+++ b/src/render/vk_materials.cpp
@@ -21,7 +21,8 @@ void GLTFMetallic_Roughness::build_pipelines(VulkanEngine *engine)
    layoutBuilder.add_binding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);

    materialLayout = layoutBuilder.build(engine->_deviceManager->device(),
-                                         VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT);
+                                         VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
+                                         nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);

    VkDescriptorSetLayout layouts[] = {
        engine->_descriptorManager->gpuSceneDataLayout(),
--- a/src/render/vk_renderpass_background.cpp
+++ b/src/render/vk_renderpass_background.cpp
@@ -94,6 +94,6 @@ void BackgroundPass::cleanup()
        _context->pipelines->destroyComputePipeline("gradient");
        _context->pipelines->destroyComputePipeline("sky");
    }
-    fmt::print("RenderPassManager::cleanup()\n");
+    fmt::print("BackgroundPass::cleanup()\n");
    _backgroundEffects.clear();
 }
--- a/src/render/vk_renderpass_lighting.cpp
+++ b/src/render/vk_renderpass_lighting.cpp
@@ -30,7 +30,9 @@ void LightingPass::init(EngineContext *context)
        builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
        builder.add_binding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
        builder.add_binding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        _gBufferInputDescriptorLayout = builder.build(_context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT);
+        _gBufferInputDescriptorLayout = builder.build(
+            _context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT,
+            nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);
    }

    // Allocate and write GBuffer descriptor set
@@ -51,21 +53,22 @@ void LightingPass::init(EngineContext *context)
    {
        DescriptorLayoutBuilder builder;
        builder.add_binding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, kShadowCascadeCount);
-        _shadowDescriptorLayout = builder.build(_context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT);
+        _shadowDescriptorLayout = builder.build(
+            _context->getDevice()->device(), VK_SHADER_STAGE_FRAGMENT_BIT,
+            nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT);
    }

-    // Build lighting pipeline through PipelineManager
+    // Build lighting pipelines (RT and non-RT) through PipelineManager
    VkDescriptorSetLayout layouts[] = {
        _context->getDescriptorLayouts()->gpuSceneDataLayout(),
        _gBufferInputDescriptorLayout,
        _shadowDescriptorLayout
    };

-    GraphicsPipelineCreateInfo info{};
-    info.vertexShaderPath = _context->getAssets()->shaderPath("fullscreen.vert.spv");
-    info.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting.frag.spv");
-    info.setLayouts.assign(std::begin(layouts), std::end(layouts));
-    info.configure = [this](PipelineBuilder &b) {
+    GraphicsPipelineCreateInfo baseInfo{};
+    baseInfo.vertexShaderPath = _context->getAssets()->shaderPath("fullscreen.vert.spv");
+    baseInfo.setLayouts.assign(std::begin(layouts), std::end(layouts));
+    baseInfo.configure = [this](PipelineBuilder &b) {
        b.set_input_topology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
        b.set_polygon_mode(VK_POLYGON_MODE_FILL);
        b.set_cull_mode(VK_CULL_MODE_NONE, VK_FRONT_FACE_CLOCKWISE);
@@ -74,13 +77,16 @@ void LightingPass::init(EngineContext *context)
        b.disable_depthtest();
        b.set_color_attachment_format(_context->getSwapchain()->drawImage().imageFormat);
    };
-    _context->pipelines->createGraphicsPipeline("deferred_lighting", info);

-    // fetch the handles so current frame uses latest versions
-    MaterialPipeline mp{};
-    _context->pipelines->getMaterialPipeline("deferred_lighting", mp);
-    _pipeline = mp.pipeline;
-    _pipelineLayout = mp.layout;
+    // Non-RT variant (no TLAS required)
+    auto infoNoRT = baseInfo;
+    infoNoRT.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting_nort.frag.spv");
+    _context->pipelines->createGraphicsPipeline("deferred_lighting.nort", infoNoRT);
+
+    // RT variant (requires GL_EXT_ray_query and TLAS bound at set=0,binding=1)
+    auto infoRT = baseInfo;
+    infoRT.fragmentShaderPath = _context->getAssets()->shaderPath("deferred_lighting.frag.spv");
+    _context->pipelines->createGraphicsPipeline("deferred_lighting.rt", infoRT);

    _deletionQueue.push_function([&]() {
        // Pipelines are owned by PipelineManager; only destroy our local descriptor set layout
@@ -145,8 +151,20 @@ void LightingPass::draw_lighting(VkCommandBuffer cmd,
    VkImageView drawView = resources.image_view(drawHandle);
    if (drawView == VK_NULL_HANDLE) return;

-    // Re-fetch pipeline in case it was hot-reloaded
-    pipelineManager->getGraphics("deferred_lighting", _pipeline, _pipelineLayout);
+    // Choose RT only if TLAS is valid; otherwise fall back to non-RT.
+    const bool haveRTFeatures = ctxLocal->getDevice()->supportsAccelerationStructure();
+    const VkAccelerationStructureKHR tlas = (ctxLocal->ray ? ctxLocal->ray->tlas() : VK_NULL_HANDLE);
+    const VkDeviceAddress tlasAddr = (ctxLocal->ray ? ctxLocal->ray->tlasAddress() : 0);
+    const bool useRT = haveRTFeatures && (ctxLocal->shadowSettings.mode != 0u) && (tlas != VK_NULL_HANDLE) && (tlasAddr != 0);
+
+    const char* pipeName = useRT ? "deferred_lighting.rt" : "deferred_lighting.nort";
+    if (!pipelineManager->getGraphics(pipeName, _pipeline, _pipelineLayout))
+    {
+        // Try the other variant as a fallback
+        const char* fallback = useRT ? "deferred_lighting.nort" : "deferred_lighting.rt";
+        if (!pipelineManager->getGraphics(fallback, _pipeline, _pipelineLayout))
+            return; // Neither pipeline is ready
+    }

    // Dynamic rendering is handled by the RenderGraph using the declared draw attachment.

@@ -168,14 +186,10 @@ void LightingPass::draw_lighting(VkCommandBuffer cmd,
        deviceManager->device(), descriptorLayouts->gpuSceneDataLayout());
    DescriptorWriter writer;
    writer.write_buffer(0, gpuSceneDataBuffer.buffer, sizeof(GPUSceneData), 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
-    // If TLAS available and feature enabled, bind it at (set=0,binding=1)
-    if (ctxLocal->ray && ctxLocal->getDevice()->supportsAccelerationStructure() && ctxLocal->shadowSettings.mode != 0u)
+    // Only write TLAS when using the RT pipeline and we have a valid TLAS
+    if (useRT)
    {
-        VkAccelerationStructureKHR tlas = ctxLocal->ray->tlas();
-        if (tlas != VK_NULL_HANDLE)
-        {
-            writer.write_acceleration_structure(1, tlas);
-        }
+        writer.write_acceleration_structure(1, tlas);
    }
    writer.update_set(deviceManager->device(), globalDescriptor);

--- a/src/render/vk_renderpass_shadow.cpp
+++ b/src/render/vk_renderpass_shadow.cpp
@@ -47,6 +47,7 @@ void ShadowPass::init(EngineContext *context)
        b.set_multisampling_none();
        b.disable_blending();

+        // Keep reverse-Z convention for shadow maps to match engine depth usage
        b.enable_depthtest(true, VK_COMPARE_OP_GREATER_OR_EQUAL);
        b.set_depth_format(VK_FORMAT_D32_SFLOAT);

--- a/src/scene/vk_loader.cpp
+++ b/src/scene/vk_loader.cpp
@@ -6,6 +6,7 @@
 #include "render/vk_materials.h"
 #include "core/vk_initializers.h"
 #include "core/vk_types.h"
+#include "core/config.h"
 #include <glm/gtx/quaternion.hpp>

 #include <fastgltf/glm_element_traits.hpp>
@@ -42,6 +43,9 @@ std::optional<AllocatedImage> load_image(VulkanEngine *engine, fastgltf::Asset &
                    VkFormat fmt = srgb ? VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM;
                    newImage = engine->_resourceManager->create_image(
                        data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false);
+                    // Name the allocation for diagnostics
+                    if (vmaDebugEnabled())
+                        vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, path.c_str());

                    stbi_image_free(data);
                }
@@ -59,6 +63,8 @@ std::optional<AllocatedImage> load_image(VulkanEngine *engine, fastgltf::Asset &
                    VkFormat fmt = srgb ? VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM;
                    newImage = engine->_resourceManager->create_image(
                        data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false);
+                    if (vmaDebugEnabled())
+                        vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, "gltf.vector.image");

                    stbi_image_free(data);
                }
@@ -86,8 +92,10 @@ std::optional<AllocatedImage> load_image(VulkanEngine *engine, fastgltf::Asset &
                                       imagesize.depth = 1;

                                       VkFormat fmt = srgb ? VK_FORMAT_R8G8B8A8_SRGB : VK_FORMAT_R8G8B8A8_UNORM;
-                                       newImage = engine->_resourceManager->create_image(
+                    newImage = engine->_resourceManager->create_image(
                                           data, imagesize, fmt, VK_IMAGE_USAGE_SAMPLED_BIT, false);
+                                       if (vmaDebugEnabled())
+                                           vmaSetAllocationName(engine->_deviceManager->allocator(), newImage.allocation, "gltf.bufferview.image");

                                       stbi_image_free(data);
                                   }
@@ -256,22 +264,33 @@ std::optional<std::shared_ptr<LoadedGLTF> > loadGltf(VulkanEngine *engine, std::
    //< load_arrays

    // load all textures
-    for (fastgltf::Image &image: gltf.images)
+    for (size_t i = 0; i < gltf.images.size(); ++i)
    {
+        fastgltf::Image &image = gltf.images[i];
        // Default-load GLTF images as linear; baseColor is reloaded as sRGB when bound
        std::optional<AllocatedImage> img = load_image(engine, gltf, image, false);

        if (img.has_value())
        {
            images.push_back(*img);
-            file.images[image.name.c_str()] = *img;
+            // Use a unique, stable key so every allocation is tracked and later freed.
+            std::string key = image.name.empty() ? (std::string("gltf.image.") + std::to_string(i))
+                                                 : std::string(image.name.c_str());
+            // Avoid accidental collisions from duplicate names
+            int suffix = 1;
+            while (file.images.find(key) != file.images.end())
+            {
+                key = (image.name.empty() ? std::string("gltf.image.") + std::to_string(i)
+                                          : std::string(image.name.c_str())) + std::string("#") + std::to_string(suffix++);
+            }
+            file.images[key] = *img;
        }
        else
        {
            // we failed to load, so lets give the slot a default white texture to not
            // completely break loading
            images.push_back(engine->_errorCheckerboardImage);
-            std::cout << "gltf failed to load texture " << image.name << std::endl;
+            std::cout << "gltf failed to load texture index " << i << " (name='" << image.name << "')" << std::endl;
        }
    }

--- a/src/scene/vk_scene.cpp
+++ b/src/scene/vk_scene.cpp
@@ -198,6 +198,13 @@ std::shared_ptr<LoadedGLTF> SceneManager::getScene(const std::string &name)

 void SceneManager::cleanup()
 {
+    // Explicitly clear dynamic instances first to drop any extra shared_ptrs
+    // that could keep GPU resources alive.
+    clearMeshInstances();
+    clearGLTFInstances();
+
+    // Drop our references to GLTF scenes. Their destructors call clearAll()
+    // exactly once to release GPU resources.
    loadedScenes.clear();
    loadedNodes.clear();
 }