Reputation: 48
I wrote a 3D non local mean denoising function using openGL compute shaders in a windows form application via openTK, this function performs a denoising on float grayscale volumetric images (voxel), the algorithm is proposed in https://hal.archives-ouvertes.fr/hal-00271141/document chapter 5 and essentially weighs the contribution of voxels across the images based on a similarity window.
In this case I used a 3x3x3 similarity window i evaluate a "block" of dimension block * block * block
on a stack of 300x300x300 voxels, obviously the bigger the "block" is the better the denoiser works, a block size of 10 takes 8 sec and a block of 20 takes 50 sec, if I try a block of 30 the app shuts down with this error: The program '[12432] ImageView_2.exe' has exited with code 3 (0x3).
I wonder if there is a way to improve the performance of the function and a way of preventing the application exiting with big block sizes.
Here are the images: original, denoise10, denoise20.
this is the Shader:
#version 450 core
layout (local_size_x = 8, local_size_y = 8, local_size_z = 8) in;
layout(r32f) uniform image3D output_imgTx;
layout(r32f) uniform image3D input_imgTx;
uniform ivec3 size;
uniform float gauss_ker[27];
uniform int block;
float make_weight_at(int bx, int by, int bz, int wd, float va[27], float gauss[27])
{
int tm = wd / 2;
double weight = 0;
int l = size[2];
int h = size[1];
int w = size[0];
for (int z = 0; z < wd; z++)
{
int zv = int(mod(bz + z - tm, l));
for (int y = 0; y < wd; y++)
{
int yv = int(mod(by + y - tm, h));
for (int x = 0; x < wd; x++)
{
int xv = int(mod(bx + x - tm, w));
int i = z * wd * wd + y * wd + x;
float u = imageLoad(input_imgTx,ivec3(xv,yv,zv)).x;
weight += gauss[i]*(va[i] -u)*(va[i] -u);
}
}
}
return float(weight);
}
void main(void)
{
double a = 0.0513199523517382;
double b = 118.680044994534;
int side = 3;
float h_par = float(b/1.5);
int l = size[2];
int h = size[1];
int w = size[0];
ivec3 p = ivec3(gl_GlobalInvocationID.xyz);
vec3 ctr = vec3(size[0]/2.0, size[1]/2.0, size[2]/2.0);
vec2 rp = vec2(p.xy)-ctr.xy;
if(p.z>=l)
return;
if(length(rp)<=float(w)/2.0f)
{
float v[27];
int wd = side;
int tm = wd/2;
for (int z = 0; z < wd; z++)
{
int zv = int(mod(p.z + z - tm, l));
for (int y = 0; y < wd; y++)
{
int yv = int(mod(p.y + y - tm, h));
for (int x = 0; x < wd; x++)
{
int xv = int(mod(p.x + x - tm, w));
int i = z*wd*wd + y*wd + x;
v[i] = imageLoad(input_imgTx,ivec3(xv,yv,zv)).x;
}
}
}
double sumo = 0;
double normalizingFactor = 0;
for (int k = 0; k < block; k++)
{
for (int j = 0; j < block; j++)
{
for (int i = 0; i < block; i++)
{
double val2 = double(exp(-make_weight_at(p.x + i - block/2, p.y + j- block/2, p.z + k- block/2, side, v, gauss_ker)/h_par/h_par));
normalizingFactor += val2;
sumo += imageLoad(input_imgTx,ivec3(int(mod(p.x + i- block/2, w)),int(mod(p.y + j- block/2, h)),int(mod(p.z + k- block/2, l)))).x*val2;
}
}
}
imageStore(output_imgTx, p, vec4(float(sumo/ normalizingFactor),0,0,0));
}
else
{
imageStore(output_imgTx, p, vec4(0,0,0,0));
}
}
this is the call to the function:
protected override void OnLoad(EventArgs e)
{
_debugProcCallbackHandle = System.Runtime.InteropServices.GCHandle.Alloc(_debugProcCallback);
GL.DebugMessageCallback(_debugProcCallback, IntPtr.Zero);
GL.Enable(EnableCap.DebugOutput);
GL.Enable(EnableCap.DebugOutputSynchronous);
this.Visible = false;
nlm_shader = new Shader(nlm_str, false);
int wdt = stk[0].Width;
int hgt = stk[0].Height;
int len = stk.Length;
inpuitimgae = GL.GenTexture();
GL.ActiveTexture(TextureUnit.Texture0);
GL.BindTexture(TextureTarget.Texture3D, inpuitimgae);
GL.TexImage3D(TextureTarget.Texture3D, 0, PixelInternalFormat.R32f, wdt, hgt, len, 0, PixelFormat.Red, PixelType.Float, IntPtr.Zero);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMinFilter, (int)TextureMinFilter.Linear);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMagFilter, (int)TextureMagFilter.Linear);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapS, (int)TextureWrapMode.ClampToEdge);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapT, (int)TextureWrapMode.ClampToEdge);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapR, (int)TextureWrapMode.ClampToEdge);
for (int i = 0; i <len; i++)
{
GL.TexSubImage3D(TextureTarget.Texture3D, 0, 0,0 , i, wdt, hgt, 1, PixelFormat.Red, PixelType.Float, stk[i].Bytes);
}
GL.BindImageTexture(0, inpuitimgae, 0, true, 0, TextureAccess.ReadOnly, SizedInternalFormat.R32f);
nlm_shader.Use();
nlm_shader.SetInt("input_imgTx", 0);
outputimg = GL.GenTexture();
GL.ActiveTexture(TextureUnit.Texture1);
GL.BindTexture(TextureTarget.Texture3D, outputimg);
GL.TexImage3D(TextureTarget.Texture3D, 0, PixelInternalFormat.R32f, wdt, hgt, len, 0, PixelFormat.Red, PixelType.Float, IntPtr.Zero);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMinFilter, (int)TextureMinFilter.Linear);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureMagFilter, (int)TextureMagFilter.Linear);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapS, (int)TextureWrapMode.ClampToEdge);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapT, (int)TextureWrapMode.ClampToEdge);
GL.TexParameter(TextureTarget.Texture3D, TextureParameterName.TextureWrapR, (int)TextureWrapMode.ClampToEdge);
GL.BindImageTexture(1, outputimg, 0, true, 0, TextureAccess.WriteOnly, SizedInternalFormat.R32f);
nlm_shader.SetInt("output_imgTx", 1);
nlm_shader.Setivec3("size", new int[] { stk[0].Width, stk[0].Height, stk.Length });
nlm_shader.SetInt("block", 30);
float[] gauss_ker = make_square_gauss_kernel(3, 0.5);
nlm_shader.setFloatArray("gauss_ker[0]", gauss_ker);
GL.DispatchCompute(wdt/8+1, hgt/8+1, len/8+1);
GL.MemoryBarrier(MemoryBarrierFlags.AllBarrierBits);
GL.BindTexture(TextureTarget.Texture3D, outputimg);
GL.ActiveTexture(TextureUnit.Texture1);
float[] arrp1 = new float[wdt * hgt * len];
GL.GetTextureImage(outputimg, 0, PixelFormat.Red, PixelType.Float, wdt * hgt * len* 4, arrp1);
for (int i = 0; i < len; i++)
{
System.Buffer.BlockCopy(arrp1, i * wdt * hgt * 4, stk[i].ManagedArray, 0, wdt * hgt * 4);
}
base.OnLoad(e);
}
Upvotes: 1
Views: 105