user369070
user369070

Reputation: 635

Weird compute shader latency

I'm trying to implement frustum culling in a compute shader. For that I have a pair of buffers for instanced vertex attributes and a pair of buffers for indirect draw commands. My compute shader checks whether the instance coordinates from the first buffer are within the bounding volume (referencing the first draw buffer for counts), uses subgroupBallot and bitCount to find each instance's offset within its subgroup, adds the results from the other subgroups plus a global offset, and finally stores the result in the second buffer. The global offset is stored in the second indirect draw buffer.

The problem is that, under load, the frustum may lag a few (more than one) frames behind the moving camera, leaving wide bands of missing objects at the edges of the view. This seems weird to me because culling and rendering are done within the same command buffer.

When taking a capture in RenderDoc, taking a screenshot with Alt+PrintScreen, or pausing the render-present thread, things snap back to how they should be.

My only guess is that the compute shader from the previous frame continues to execute even after the new frame starts being drawn, though this should not happen because of the pipeline barriers.

Shader code:

#version 460

// Subgroup ballot built-ins (subgroupBallot, gl_SubgroupLtMask) are used in main().
#extension GL_KHR_shader_subgroup_ballot : require

// One indexed indirect draw record: five tightly packed 32-bit words.
// NOTE(review): field names follow VkDrawIndexedIndirectCommand, which the host
// passes as the stride to vkCmdDrawIndexedIndirectCount - confirm the layouts agree.
struct drawData{
    uint indexCount;
    uint instanceCount;
    uint firstIndex;
    uint vertexOffset;
    uint firstInstance;
};

// Per-instance attributes: seven floats per instance.
struct instanceData{
    float x, y, z;      // instance position; main() subtracts it from camPos
    float a, b, c, d;   // not read by the shader, only copied (presumably orientation - TODO confirm)
};

// 128 invocations per workgroup along x.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Camera and frustum parameters supplied by the host.
layout(set = 0, binding = 0) uniform A
{
    mat4 cam;       // not referenced by main()
    vec4 camPos;    // xyz: camera position
    vec4 l;         // xyz: plane vector dotted with the camera-relative position; w: maximum view distance
    vec4 t;         // xyz: plane vector (presumably top - confirm)
    vec4 r;         // xyz: plane vector (presumably right - confirm)
    vec4 b;         // xyz: plane vector (presumably bottom - confirm)
};

// Indirect draw buffers: main() reads Draw[0] and writes Draw[1].
layout(set = 0, binding = 1) buffer B
{
    uint count;         // copied from Draw[0] to Draw[1] unchanged
    drawData data[];    // only data[0] is touched by main()
} Draw[2];

// Instance buffers: Instance[0] is the source, Instance[1] receives the instances that pass the test.
layout(set = 0, binding = 2) buffer C
{
    instanceData data[];
} Instance[2];

// Number of passing instances per subgroup for the current loop iteration, indexed by gl_SubgroupID.
// NOTE(review): 32 slots for 128 invocations assumes a subgroup size of at least 4.
shared uint offsetsM[32];

// Culls the instances of Draw[0].data[0] against the view volume and writes the
// survivors, tightly packed and in their original order, to Instance[1]; the
// surviving count ends up in Draw[1].data[0].instanceCount.
//
// The shader is written for a single workgroup: each loop iteration handles
// one "patch" of gl_WorkGroupSize.x instances.
// Per iteration: every subgroup ballots its pass flags, the elected invocation
// publishes the subgroup's pass count in shared memory, and every invocation
// derives its destination slot from
//   (instances kept so far) + (passes in lower subgroups) + (passes in lower lanes).
void main()
{
    const uint gID       = gl_LocalInvocationID.x;
    const uint patchSize = gl_WorkGroupSize.x;

    // Only one invocation copies the draw header. The same invocation stores the
    // final instanceCount below, so the two writes are ordered by program order
    // instead of racing with identical writes from the other invocations.
    if(gID == 0){
        Draw[1].data[0] = Draw[0].data[0];//copy data like index count
        Draw[1].count   = Draw[0].count;
    }

    const uint instanceTotal = Draw[0].data[0].instanceCount;

    // Round up so a trailing partial patch is processed too; the out-of-range
    // invocations of that patch are masked out through inRange below.
    const uint loops = instanceTotal/patchSize + uint(instanceTotal%patchSize != 0u);

    // Loop-invariant: the four plane vectors as matrix columns and the squared far distance.
    const mat4x3 lrtb      = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);
    const float  maxDistSq = l.w*l.w;

    uint offsetG = 0;//accumulating offset within end buffer

    for(uint i = 0; i<loops; ++i){
        const uint posa    = i*patchSize+gID;
        const bool inRange = posa<instanceTotal;
        // Invocations past the end read element 0 (it exists whenever the loop runs)
        // so no out-of-bounds load happens; their result is discarded via inRange.
        const uint src     = inRange ? posa : 0u;

        vec3 pos  = camPos.xyz-vec3(Instance[0].data[src].x, Instance[0].data[src].y, Instance[0].data[src].z);//position relative to camera
        // NOTE(review): `Model` is not declared in the visible part of the shader - confirm its binding.
        vec4 dist = pos*lrtb+Model.data[0].rad;//dot products and radius tolerance
        bool Pass = inRange                          &&//is real
                    (dot(pos, pos)<maxDistSq)        &&//not too far
                    all(greaterThan(dist, vec4(0)));   //within view frustum

        // Ballot in uniform control flow. The bit-count built-ins cover every
        // supported subgroup width, not only the low 64 bits of the mask.
        const uvec4 actives = subgroupBallot(Pass);
        const uint  offsetL = subgroupBallotExclusiveBitCount(actives);//offset within subgroup
        if(subgroupElect())
            offsetsM[gl_SubgroupID] = subgroupBallotBitCount(actives);
        barrier();//all per-subgroup counts are visible from here on

        uint before = 0;//passes in subgroups ahead of this one
        uint passed = 0;//passes in the whole patch
        for(uint s = 0; s<gl_NumSubgroups; ++s){
            const uint n = offsetsM[s];
            if(s<gl_SubgroupID)
                before += n;
            passed += n;
        }

        if(Pass)
            Instance[1].data[offsetG+before+offsetL] = Instance[0].data[posa];
        offsetG += passed;

        // Nobody may overwrite offsetsM for the next patch until everyone has
        // finished summing the counts of this one.
        barrier();
    }

    if(gID == 0)
        Draw[1].data[0].instanceCount = offsetG;
}

For renderpass after the compute I have dependencies:

{//1, 2
// Both dependencies make compute-shader writes performed before the render
// pass visible to subpass 0; they differ only in the consuming stage/access.
const auto computeWritesTo = [&](int slot, VkPipelineStageFlags consumerStage, VkAccessFlags consumerAccess){
    auto& dep = deps[slot];
    dep.srcSubpass      = VK_SUBPASS_EXTERNAL;
    dep.srcStageMask    = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    dep.srcAccessMask   = VK_ACCESS_SHADER_WRITE_BIT;
    dep.dstSubpass      = 0;
    dep.dstStageMask    = consumerStage;
    dep.dstAccessMask   = consumerAccess;
    dep.dependencyFlags = 0;
};

// 1: indirect draw commands read by the draw call
computeWritesTo(1, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT);
// 2: per-instance vertex attributes fetched by vertex input
computeWritesTo(2, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT);
}

The command buffer is recorded as follows (it is fully reused as is, one for each image in the swapchain):

// Records one reusable command buffer: culling dispatch, barriers that hand the
// culled buffers over to the draw, then the render pass with a single
// indirect-count draw.
vkBeginCommandBuffer(cmd, &begInfo);

    // Write-after-read guard: the dispatch below overwrites buffers that the
    // draw of the previously submitted command buffer may still be fetching.
    // An execution-only dependency is sufficient for a WAR hazard.
    // NOTE(review): assumes all frames are submitted to the same queue - confirm.
    vkCmdPipelineBarrier(cmd,
                         VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
                         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         0, 0, nullptr, 0, nullptr, 0, nullptr);

    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
                            0, 1, &descs[1], 0, nullptr);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
    vkCmdDispatch(cmd, 1, 1, 1);

    // Value-initialise so that no member (sType, pNext, queue family indices)
    // is left indeterminate.
    VkBufferMemoryBarrier bufMemBar[2]{};
    {//mem bars
        {//0 indirect
            bufMemBar[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
            bufMemBar[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;//no ownership transfer
            bufMemBar[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[0].buffer = bufferIndirect;
            bufMemBar[0].offset = 0;
            bufMemBar[0].size   = VK_WHOLE_SIZE;
        }
        {//1 vertex instance
            bufMemBar[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
            bufMemBar[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[1].buffer = bufferInstance;
            bufMemBar[1].offset = 0;
            bufMemBar[1].size   = VK_WHOLE_SIZE;
        }
    }
    // One barrier call covers both consumers of the compute results.
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
                         0, 0, nullptr, 2, bufMemBar, 0, nullptr);

    VkRenderPassBeginInfo passBegInfo{};
    passBegInfo.sType       = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
    passBegInfo.renderPass  = pass;
    passBegInfo.framebuffer = chain.frames[i];
    passBegInfo.renderArea  = {{0, 0}, chain.dim};
        VkClearValue clears[2]{{0},{0}};
    passBegInfo.clearValueCount = 2;
    passBegInfo.pClearValues    = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, nullptr);
    vkCmdBindPipeline      (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
        VkBuffer     buffersVertex[2]{bufferVertexProto, bufferInstance};
        VkDeviceSize offsetsVertex[2]{0, 0};
    vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
    vkCmdBindIndexBuffer  (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);

    // The draw records start 4 bytes into the buffer, right after the draw
    // count that is read from offset 0 of the same buffer.
    vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
                                       bufferIndirect, 0,
                                  count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);

vkEndCommandBuffer(cmd);

Rendering and presentation are synchronised with two semaphores, imageAvailable and renderFinished. The frustum is calculated in the right order on the CPU. Validation layers are enabled.

Upvotes: 1

Views: 198

Answers (1)

user369070
user369070

Reputation: 635

The problem was that I lacked host synchronisation. Indeed, even within the same command buffer there are no host synchronisation guarantees (and that makes sense, since it is what enables us to use events).

Upvotes: 1

Related Questions