Arun AC

Reputation: 417

Why is the compute shader slowing down the rendering API calls?

I am using a compute shader to process the input buffer data and store the result in an output texture using imageStore().

After executing the compute shader, I make 3 render calls sequentially.

Compute Shader Code:

#version 310 es
precision mediump image2D;
layout(std430) buffer; // Sets the default layout for SSBOs
layout(local_size_x = 256) in; // 256 threads per work group
layout(binding = 0) readonly buffer InputBuf
{
    uint input_buff[];
} inputbuff;
layout (rgba32f, binding = 1) uniform writeonly image2D out_teximg;
void main()
{
    int idx = int(gl_GlobalInvocationID.x);
    int idy = int(gl_GlobalInvocationID.y);
    uint inputpix = inputbuff.input_buff[1024 * idy + idx];
    // some calculation on inputpix and output is rcolor, bcolor, gcolor
    imageStore(out_teximg, ivec2(idx, idy), vec4(rcolor, bcolor, gcolor, 1.0));
    barrier();
}

Code:

void initCompute()
{
    glGenTextures(1, &computeOutTex);
    glGenBuffers(1, &inSSBOId);
}

uint inputBuffData[] = { .... }; // input buffer data
void execute_compute()
{
    // compute shader code starts...
    glUseProgram(computePgmId);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, computeOutTex);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA32F, width, height);

    glBindImageTexture(1, computeOutTex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F); // binding is 1 
    glUniform1i( glGetUniformLocation(computePgmId, "out_teximg"), 0);
    
    uint inputBuffSize = 1024 * 512 * 3;
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, inSSBOId);
    glBufferData(GL_SHADER_STORAGE_BUFFER, inputBuffSize, inputBuffData, GL_STATIC_DRAW);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER,  0 , inSSBOId); // binding is 0

    glDispatchCompute(width / 256, height, 1);
    glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    // glFinish();
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
    glBindImageTexture(1, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);  // binding is 1 
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 0);// binding is 0
}

int draw()
{
    glBindFramebuffer(GL_FRAMEBUFFER, m_FBOId); // Offscreen Rendering
    glClear(GL_COLOR_BUFFER_BIT);
    glUseProgram(compute_pgm); 
    execute_compute();

    glUseProgram(render_pgm1);
    glViewport(0,0,w,h);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, computeOutTex);
    glDrawElements(); // Render the texture data

  // 2nd draw call 
    glUseProgram(render_pgm2);
    ....
    ....
    glDrawElements();

  // 3rd draw call
    glUseProgram(render_pgm3);
    ....
    ....
    glDrawElements(); 
    
    glBindFramebuffer(GL_FRAMEBUFFER, 0); // unbind FBO
}

Here, only the 2nd draw call takes more time after the compute shader is used.

If glFinish() is called after glMemoryBarrier(), then only the execute_compute() call is slowed down instead. Why is the compute shader slowing down the subsequent draw calls? Is glFinish() really needed?

Upvotes: 1

Views: 997

Answers (1)

Rabbid76

Reputation: 210890

The compute shader does not slow down the subsequent draw calls, but the compute shader itself takes some time to execute. Since you set a memory barrier, the subsequent draw calls have to wait until that work is done.
OpenGL commands are queued and are not executed immediately when they are called. The GPU and the CPU work in parallel: the CPU submits instructions to the GPU, and the GPU processes them as quickly as it can.
glFinish does not return until all previously issued commands have completed. glFinish itself is not "costly"; it only appears costly when you measure time on the CPU, because what you are really measuring is the time it takes the GPU to complete all previously issued OpenGL commands.
Anyway, glFinish is not needed here. All you need is the memory barrier. With the memory barrier in place, the OpenGL commands that depend on it appear to take longer to complete. They do not actually take longer; they just have to wait until the condition signaled by the barrier is met.
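A rough CPU-side timing sketch of this effect (now_ms() and draw_with_texture() are assumed placeholder helpers, not from the question):

```c
/* Without glFinish, execute_compute() returns almost immediately,
   because the commands are only queued; the wait then shows up in
   whichever later command first depends on the compute result. */
double t0 = now_ms();
execute_compute();     /* returns quickly: commands are queued, not done */
double t1 = now_ms();
/* glFinish(); */      /* uncommenting moves the wait to this point */
draw_with_texture();   /* this draw waits (on the GPU) for the barrier */
double t2 = now_ms();
printf("submit: %.2f ms, dependent draw: %.2f ms\n", t1 - t0, t2 - t1);
```

Either way the total GPU time is the same; glFinish only changes where the stall is observed on the CPU timeline.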

In your case you need to use GL_ALL_BARRIER_BITS or, more specifically, GL_TEXTURE_FETCH_BARRIER_BIT, which makes incoherent memory writes (e.g. image stores) prior to the barrier visible to texture fetches after the barrier.
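A minimal sketch of the corrected ordering, using the names from the question (computeOutTex, width, height); the draw-call arguments are placeholders:

```c
/* Dispatch the compute shader that writes the image. */
glDispatchCompute(width / 256, height, 1);

/* GL_SHADER_IMAGE_ACCESS_BARRIER_BIT only orders later image
   loads/stores. Sampling the texture in the render programs needs
   GL_TEXTURE_FETCH_BARRIER_BIT (or GL_ALL_BARRIER_BITS). */
glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);

/* No glFinish() required: the barrier alone orders the GPU work.
   The dependent draw simply waits on the GPU side. */
glBindTexture(GL_TEXTURE_2D, computeOutTex);
glDrawElements(GL_TRIANGLES, index_count, GL_UNSIGNED_SHORT, 0);
```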

Upvotes: 2
