Reputation: 635
I'm trying to make frustrum culling via compute shader. For that I have a pair of buffers for instanced vertex attributes, and a pair of buffers for indirect draw commands. My compute shader checks if instance coordinates from first buffer are within bounding volume, referencing first draw buffer for counts, subgroupBallot
and bitCount
to see offset within subgroup, then add results from other subgroups and a global offset, and finally stores the result in second buffer. The global offset is stored in second indirect draw buffer.
The problem is that, when under load, frustum may be few(>1) frames late to the moving camera, with wide lines of disappeared objects on edge. It seems weird to me because culling and rendering are done within same command buffer.
When taking capture in renderdoc, taking a screenshot alt+printScreen, or pausing the render-present thread, things snap back to as they should be.
My only guess is that compute shader from past frame continues to execute even when new frame starts to be drawn, though this should not be happening due to pipeline barriers.
Shader code:
#version 460
#extension GL_KHR_shader_subgroup_ballot : require
struct drawData{
uint indexCount;
uint instanceCount;
uint firstIndex;
uint vertexOffset;
uint firstInstance;
};
struct instanceData{
float x, y, z;
float a, b, c, d;
};
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
layout(set = 0, binding = 0) uniform A
{
mat4 cam;
vec4 camPos;
vec4 l;
vec4 t;
vec4 r;
vec4 b;
};
layout(set = 0, binding = 1) buffer B
{
uint count;
drawData data[];
} Draw[2];
layout(set = 0, binding = 2) buffer C
{
instanceData data[];
} Instance[2];
shared uint offsetsM[32];
void main()
{
const uint gID = gl_LocalInvocationID.x;
const uint lID = gl_SubgroupInvocationID;
const uint patchSize = gl_WorkGroupSize.x;
Draw[1].data[0] = Draw[0].data[0];//copy data like index count
Draw[1].count = Draw[0].count;
uint offsetG = 0;//accumulating offset within end buffer
uint loops = Draw[0].data[0].instanceCount/patchSize;//constant loop count
for(uint i = 0; i<loops;++i){
uint posa = i*patchSize+gID;//runs better this way for some reason
vec3 pos = camPos.xyz-vec3(Instance[0].data[posa].x, Instance[0].data[posa].y, Instance[0].data[posa].z);//position relative to camera
mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);
vec4 dist = pos*lrtb+Model.data[0].rad;//dot products and radius tolerance
bool Pass = posa<Draw[0].data[0].instanceCount&&//is real
(dot(pos, pos)<l.w*l.w) &&//not too far
all(greaterThan(dist, vec4(0))); //within view frustum
subgroupBarrier();//no idea what is the best, put what works
uvec4 actives = subgroupBallot(Pass);//count passed instances
if(subgroupElect())
offsetsM[gl_SubgroupID] = bitCount(actives).x+bitCount(actives).y;
barrier();
uint offsetL = bitCount(actives&gl_SubgroupLtMask).x+bitCount(actives&gl_SubgroupLtMask).y;//offset withing subgroup
uint ii = 0;
if(Pass){
for(; ii<gl_SubgroupID; ++ii)
offsetG+= offsetsM[ii];//offsets before subgroup
Instance[1].data[offsetG+offsetL] = Instance[0].data[posa];
for(; ii<gl_NumSubgroups; ++ii)
offsetG+= offsetsM[ii];}//offsets after subgroup
else for(; ii<gl_NumSubgroups; ++ii)
offsetG+= offsetsM[ii];//same but no data copying
}
if(gID == 0)
Draw[1].data[0].instanceCount = offsetG;
}
For renderpass after the compute I have dependencies:
{//1
deps[1].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[1].dstSubpass = 0;
deps[1].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[1].dstStageMask = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
deps[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[1].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
deps[1].dependencyFlags = 0;
}
{//2
deps[2].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[2].dstSubpass = 0;
deps[2].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[2].dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
deps[2].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[2].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
deps[2].dependencyFlags = 0;
}
The command buffer is(fully reused as is, one for each image in swapchain):
vkBeginCommandBuffer(cmd, &begInfo);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
0, 1, &descs[1], 0, 0);
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
vkCmdDispatch(cmd, 1, 1, 1);
VkBufferMemoryBarrier bufMemBar[2];
{//mem bars
{//0 indirect
bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
bufMemBar[0].buffer = bufferIndirect;
bufMemBar[0].offset = 0;
bufMemBar[0].size = -1;
}
{//1 vertex instance
bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
bufMemBar[1].buffer = bufferInstance;
bufMemBar[1].offset = 0;
bufMemBar[1].size = -1;
}
}
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, 0, 0, 0, 1, &bufMemBar[0], 0, 0);
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT , 0, 0, 0, 1, &bufMemBar[1], 0, 0);
VkRenderPassBeginInfo passBegInfo;
passBegInfo.renderPass = pass;
passBegInfo.framebuffer = chain.frames[i];
passBegInfo.renderArea = {{0, 0}, chain.dim};
VkClearValue clears[2]{{0},{0}};
passBegInfo.clearValueCount = 2;
passBegInfo.pClearValues = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, 0);
vkCmdBindPipeline (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
VkBuffer buffersVertex[2]{bufferVertexProto, bufferInstance};
VkDeviceSize offsetsVertex[2]{0, 0};
vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
vkCmdBindIndexBuffer (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
bufferIndirect, 0,
count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);
vkEndCommandBuffer(cmd);
Rendering and presentation are synchronised with two semaphores - imageAvailable, and renderFinished. Frustum calculation is in right order on CPU. Validation layers are enabled.
Upvotes: 1
Views: 198
Reputation: 635
The problem was that I lacked host synchronisation. Indeed, even within same command buffer, there are no host synchronisation guarantees (and that makes sense, since it enables us to use events).
Upvotes: 1