Michele Colle
Michele Colle

Reputation: 48

improve performance of non local mean NLM openGL kernel

I wrote a 3D non local mean denoising function using openGL compute shaders in a windows form application via openTK, this function performs a denoising on float grayscale volumetric images (voxel), the algorithm is proposed in https://hal.archives-ouvertes.fr/hal-00271141/document chapter 5 and essentially weighs the contribution of voxels across the images based on a similarity window.

In this case I used a 3x3x3 similarity window i evaluate a "block" of dimension block * block * block on a stack of 300x300x300 voxels, obviously the bigger the "block" is the better the denoiser works, a block size of 10 takes 8 sec and a block of 20 takes 50 sec, if I try a block of 30 the app shuts down with this error: The program '[12432] ImageView_2.exe' has exited with code 3 (0x3).

I wonder if there is a way to improve the performance of the function and a way of preventing the application exiting with big block sizes.

Here are the images: original, denoise10, denoise20.

this is the Shader:


#version 450 core

layout (local_size_x = 8, local_size_y = 8, local_size_z = 8) in;
layout(r32f) uniform image3D output_imgTx;
layout(r32f) uniform image3D input_imgTx;

uniform ivec3 size;
uniform float gauss_ker[27];
uniform int block;



float make_weight_at(int bx, int by, int bz, int wd, float va[27], float gauss[27])
{
    int tm = wd / 2;
    double weight = 0;
    
    int l = size[2];
    int h = size[1];
    int w = size[0];
    for (int z = 0; z < wd; z++)
    {
        int zv = int(mod(bz + z - tm, l));
        for (int y = 0; y < wd; y++)
        {
            int yv = int(mod(by + y - tm, h));
            for (int x = 0; x < wd; x++)
            {
                int xv = int(mod(bx + x - tm, w));
                int i = z * wd * wd + y * wd + x;
                float u = imageLoad(input_imgTx,ivec3(xv,yv,zv)).x;

                weight += gauss[i]*(va[i] -u)*(va[i] -u);
            }
        }
    }
    return float(weight);
}


void main(void)
{


double a = 0.0513199523517382;
double b = 118.680044994534;
int side = 3;
float h_par = float(b/1.5);



    int l = size[2];
    int h = size[1];
    int w = size[0];
    
    ivec3 p = ivec3(gl_GlobalInvocationID.xyz);

    vec3 ctr = vec3(size[0]/2.0, size[1]/2.0, size[2]/2.0); 
    vec2 rp = vec2(p.xy)-ctr.xy;

    if(p.z>=l)
        return;

    if(length(rp)<=float(w)/2.0f)
    {

        float v[27];
        int wd = side;
        int tm = wd/2;
        for (int z = 0; z < wd; z++)
        {
            int zv = int(mod(p.z + z - tm, l));
            for (int y = 0; y < wd; y++)
            {
                int yv = int(mod(p.y + y - tm, h));
                for (int x = 0; x < wd; x++)
                {
                    int xv = int(mod(p.x + x - tm, w));
                    int i = z*wd*wd + y*wd + x;
                    v[i] = imageLoad(input_imgTx,ivec3(xv,yv,zv)).x;
                }
            }
        }
        
        double sumo = 0;
        double normalizingFactor = 0;
                
        for (int k = 0; k < block; k++)
        {
            for (int j = 0; j < block; j++)
            {
                for (int i = 0; i < block; i++)
                {
                    double val2 = double(exp(-make_weight_at(p.x + i - block/2, p.y + j- block/2, p.z + k- block/2, side, v, gauss_ker)/h_par/h_par));
                    normalizingFactor += val2;
                    sumo += imageLoad(input_imgTx,ivec3(int(mod(p.x + i- block/2, w)),int(mod(p.y + j- block/2, h)),int(mod(p.z + k- block/2, l)))).x*val2;
                }
            }
        }

        imageStore(output_imgTx, p, vec4(float(sumo/ normalizingFactor),0,0,0));
    }
    else
    {
        imageStore(output_imgTx, p, vec4(0,0,0,0));
    }
}

this is the call to the function:


protected override void OnLoad(EventArgs e)
{
    _debugProcCallbackHandle = System.Runtime.InteropServices.GCHandle.Alloc(_debugProcCallback);

    GL.DebugMessageCallback(_debugProcCallback, IntPtr.Zero);
    GL.Enable(EnableCap.DebugOutput);
    GL.Enable(EnableCap.DebugOutputSynchronous);

    this.Visible = false;
    nlm_shader = new Shader(nlm_str, false);

    int wdt = stk[0].Width;
    int hgt = stk[0].Height;
    int len = stk.Length;
                    

    inpuitimgae = GL.GenTexture();
    GL.ActiveTexture(TextureUnit.Texture0);
    GL.BindTexture(TextureTarget.Texture3D, inpuitimgae);
    GL.TexImage3D(TextureTarget.Texture3D, 0, PixelInternalFormat.R32f, wdt, hgt, len, 0, PixelFormat.Red, PixelType.Float, IntPtr.Zero);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMinFilter, (int)TextureMinFilter.Linear);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMagFilter, (int)TextureMagFilter.Linear);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapS, (int)TextureWrapMode.ClampToEdge);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapT, (int)TextureWrapMode.ClampToEdge);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapR, (int)TextureWrapMode.ClampToEdge);
    for (int i = 0; i <len; i++)
    {
        GL.TexSubImage3D(TextureTarget.Texture3D, 0, 0,0 , i, wdt, hgt, 1, PixelFormat.Red, PixelType.Float, stk[i].Bytes);
    }

    GL.BindImageTexture(0, inpuitimgae, 0, true, 0, TextureAccess.ReadOnly, SizedInternalFormat.R32f);

    nlm_shader.Use();
    nlm_shader.SetInt("input_imgTx", 0);

    outputimg = GL.GenTexture();
    GL.ActiveTexture(TextureUnit.Texture1);
    GL.BindTexture(TextureTarget.Texture3D, outputimg);
    GL.TexImage3D(TextureTarget.Texture3D, 0, PixelInternalFormat.R32f, wdt, hgt, len, 0, PixelFormat.Red, PixelType.Float, IntPtr.Zero);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMinFilter, (int)TextureMinFilter.Linear);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMagFilter, (int)TextureMagFilter.Linear);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapS, (int)TextureWrapMode.ClampToEdge);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapT, (int)TextureWrapMode.ClampToEdge);
    GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapR, (int)TextureWrapMode.ClampToEdge);
    GL.BindImageTexture(1, outputimg, 0, true, 0, TextureAccess.WriteOnly, SizedInternalFormat.R32f);



                    
    nlm_shader.SetInt("output_imgTx", 1);

    nlm_shader.Setivec3("size", new int[] { stk[0].Width, stk[0].Height, stk.Length });
    nlm_shader.SetInt("block", 30);

    float[] gauss_ker = make_square_gauss_kernel(3, 0.5);
    nlm_shader.setFloatArray("gauss_ker[0]", gauss_ker);

    GL.DispatchCompute(wdt/8+1, hgt/8+1, len/8+1);
    GL.MemoryBarrier(MemoryBarrierFlags.AllBarrierBits);

    GL.BindTexture(TextureTarget.Texture3D, outputimg);
    GL.ActiveTexture(TextureUnit.Texture1);


    float[] arrp1 = new float[wdt * hgt * len];
    GL.GetTextureImage(outputimg, 0, PixelFormat.Red, PixelType.Float, wdt * hgt * len* 4, arrp1);


    for (int i = 0; i < len; i++)
    {
        System.Buffer.BlockCopy(arrp1, i * wdt * hgt * 4, stk[i].ManagedArray, 0, wdt * hgt * 4);
    }

    base.OnLoad(e);
}

Upvotes: 1

Views: 105

Answers (0)

Related Questions