Reputation: 11
I am trying to use PyCuda to convolve a Gaussian filter with an image. I've taken some code from the PyCuda documentation and a Cuda convolution kernel from a page online. For some reason, the resulting image comes out completely black. I believe that the image array and Gaussian filter array are being passed in incorrectly - when I try to use printf to print values from within the kernel, the values of the image are just '0.00...' and the values of the filter are very big numbers like '125529009160192000.000000'.
I've tried flattening the arrays and explicitly setting them to C order, but this doesn't seem to help. I've also tried playing around with PyCuda GPUarrays, but haven't had any success.
Thanks for taking a look!
Here is my code:
import pycuda.driver as cuda
import pycuda.autoinit
import math
from pycuda.compiler import SourceModule
from timeit import default_timer as timer
from PIL import Image
import numpy as np
def make_k(sig):
s = 65
out = np.zeros((s,s))
for x in range(s):
for y in range(s):
X = x-(s-1)/2
Y = y-(s-1)/2
gauss = 1/(2*np.pi*sig**2) * np.exp(-(X**2 + Y**2)/(2*sig**2))
out[x,y] = gauss
a = np.sum(out)
kernel = out/a
return kernel
def replication_pad(img, W, H, S, paddedW, paddedH):
output = np.zeros((paddedH, paddedW))
output[:S, S:W+S] = img[0:1,:]
output[S:H+S, :S] = img[:, 0:1]
output[H+S:, S:W+S] = img[-1:,:]
output[S:H+S, W+S:] = img[:, -1:]
output[:S, :S] = img[0, 0]
output[:S, paddedW-S:] = img[0, -1]
output[paddedH-S:, :S] = img[-1, 0]
output[paddedH-S:, paddedW-S:] = img[-1, -1]
output[S:H+S, S:W+S] = img
return output
#d_f is the padded image
#d_g is the filter
#d_h is the filtering result
mod = SourceModule("""
__global__ void convolution( const float *d_f, const unsigned int paddedW, const unsigned int paddedH,
const float *d_g, const int S,
float *d_h, const unsigned int W, const unsigned int H )
{
// Set the padding size and filter size
unsigned int paddingSize = S;
unsigned int filterSize = 2 * S + 1;
// Set the pixel coordinate
const unsigned int j = blockIdx.x * blockDim.x + threadIdx.x + paddingSize;
const unsigned int i = blockIdx.y * blockDim.y + threadIdx.y + paddingSize;
// Print for debugging (on the first thread)
if( i==paddingSize && j==paddingSize) {
//printf("%lf", d_g[50]);
printf("%lf", d_f[100400]);
}
// The multiply-add operation for the pixel coordinate ( j, i )
if( j >= paddingSize && j < paddedW - paddingSize && i >= paddingSize && i < paddedH - paddingSize ) {
unsigned int oPixelPos = ( i - paddingSize ) * W + ( j - paddingSize );
d_h[oPixelPos] = 0.0;
for( int k = -S; k <=S; k++ ) {
for( int l = -S; l <= S; l++ ) {
unsigned int iPixelPos = ( i + k ) * paddedW + ( j + l );
unsigned int coefPos = ( k + S ) * filterSize + ( l + S );
d_h[oPixelPos] += d_f[iPixelPos] * d_g[coefPos];
}
}
}
}
""")
image = Image.open('spooky.jpg').convert('L')
img_full = np.asarray(image, dtype='float')
img = img_full[:1080,:1920] # 1080p resolution
W = 1920
H = 1080
S = 32
paddedW = W + 2*S
paddedH = H + 2*S
img_padded = replication_pad(img, W, H, S, paddedW, paddedH)
kernel = make_k(10)
ker_cont = np.ascontiguousarray(kernel, dtype="float")
ker_gpu = cuda.mem_alloc(ker_cont.nbytes)
cuda.memcpy_htod(ker_gpu, ker_cont)
img_cont = np.ascontiguousarray(img_padded)
img_gpu = cuda.mem_alloc(img_cont.nbytes)
cuda.memcpy_htod(img_gpu, img_cont)
img_og = np.ascontiguousarray(img)
result_gpu = cuda.mem_alloc(img_og.nbytes)
blockW = 32
blockH = 32
gridW = math.ceil(W/blockW)
gridH = math.ceil(H/blockH)
func = mod.get_function("convolution")
func(img_gpu, np.int_(paddedW), np.int_(paddedH), ker_gpu, np.int_(S), result_gpu, np.int_(W), np.int_(H), block = (blockW, blockH, 1), grid=(gridW, gridH))
host_output = np.empty_like(img_og)
cuda.memcpy_dtoh(host_output, result_gpu)
Image.fromarray(host_output).show()
And here is the image I'm using: https://i.sstatic.net/8YvA6.jpg
Upvotes: 0
Views: 297
Reputation: 11
I needed to change dtypes of input image and input kernel from float64 to float32. Also needed to allocate output array with reference to float32 array for appropriate nbytes. This looked like:
ker_cont = np.float32(ker_cont)
img_cont = np.float32(img_cont)
img_og = np.float32(img_og)
result_gpu = cuda.mem_alloc(img_og.nbytes)
Upvotes: 1