Passing arrays into PyCuda convolution kernel producing unexpected behavior

Question

I am trying to use PyCuda to convolve a Gaussian filter with an image. I've taken some code from the PyCuda documentation and a Cuda convolution kernel from a page online. For some reason, the resulting image comes out completely black. I believe that the image array and Gaussian filter array are being passed in incorrectly - when I try to use printf to print values from within the kernel, the values of the image are just '0.00...' and the values of the filter are very big numbers like '125529009160192000.000000'.

I've tried flattening the arrays and explicitly setting them to C order, but this doesn't seem to help. I've also tried playing around with PyCuda GPUarrays, but haven't had any success.

Thanks for taking a look!

Here is my code:

import pycuda.driver as cuda
import pycuda.autoinit
import math
from pycuda.compiler import SourceModule
from timeit import default_timer as timer
from PIL import Image
import numpy as np

def make_k(sig):
    s = 65
    out = np.zeros((s,s))
    for x in range(s):
        for y in range(s):
            X = x-(s-1)/2
            Y = y-(s-1)/2
            gauss = 1/(2*np.pi*sig**2) * np.exp(-(X**2 + Y**2)/(2*sig**2))
            out[x,y] = gauss
    a = np.sum(out)
    kernel = out/a
    return kernel

def replication_pad(img, W, H, S, paddedW, paddedH):
    output = np.zeros((paddedH, paddedW))
    output[:S, S:W+S] = img[0:1,:]
    output[S:H+S, :S] = img[:, 0:1]
    output[H+S:, S:W+S] = img[-1:,:]
    output[S:H+S, W+S:] = img[:, -1:]

    output[:S, :S] = img[0, 0]
    output[:S, paddedW-S:] = img[0, -1]
    output[paddedH-S:, :S] = img[-1, 0]
    output[paddedH-S:, paddedW-S:] = img[-1, -1]

    output[S:H+S, S:W+S] = img
    return output


#d_f is the padded image
#d_g is the filter
#d_h is the filtering result

mod = SourceModule("""
__global__ void convolution( const float *d_f, const unsigned int paddedW, const unsigned int paddedH,
                                      const float *d_g, const int S,
                                      float *d_h, const unsigned int W, const unsigned int H )
{   
    // Set the padding size and filter size
    unsigned int paddingSize = S;
    unsigned int filterSize = 2 * S + 1;

    // Set the pixel coordinate
    const unsigned int j = blockIdx.x * blockDim.x + threadIdx.x + paddingSize;
    const unsigned int i = blockIdx.y * blockDim.y + threadIdx.y + paddingSize;

    // Print for debugging (on the first thread)
    if( i==paddingSize && j==paddingSize) {
        //printf("%lf", d_g[50]);
        printf("%lf", d_f[100400]);
    }

    // The multiply-add operation for the pixel coordinate ( j, i )
    if( j >= paddingSize && j < paddedW - paddingSize && i >= paddingSize && i < paddedH - paddingSize ) {
        unsigned int oPixelPos = ( i - paddingSize ) * W + ( j - paddingSize );
        d_h[oPixelPos] = 0.0;
        for( int k = -S; k <=S; k++ ) {
            for( int l = -S; l <= S; l++ ) {
                unsigned int iPixelPos = ( i + k ) * paddedW + ( j + l );
                unsigned int coefPos = ( k + S ) * filterSize + ( l + S );
                d_h[oPixelPos] += d_f[iPixelPos] * d_g[coefPos];
            }
        }
    }

}
""")

image = Image.open('spooky.jpg').convert('L')
img_full = np.asarray(image, dtype='float')
img = img_full[:1080,:1920] # 1080p resolution
W = 1920
H = 1080

S = 32
paddedW = W + 2*S
paddedH = H + 2*S

img_padded = replication_pad(img, W, H, S, paddedW, paddedH)

kernel = make_k(10)
ker_cont = np.ascontiguousarray(kernel, dtype="float")
ker_gpu = cuda.mem_alloc(ker_cont.nbytes)
cuda.memcpy_htod(ker_gpu, ker_cont)

img_cont = np.ascontiguousarray(img_padded)
img_gpu = cuda.mem_alloc(img_cont.nbytes)
cuda.memcpy_htod(img_gpu, img_cont)

img_og = np.ascontiguousarray(img)
result_gpu = cuda.mem_alloc(img_og.nbytes)

blockW = 32
blockH = 32
gridW = math.ceil(W/blockW)
gridH = math.ceil(H/blockH)

func = mod.get_function("convolution")
func(img_gpu, np.int_(paddedW), np.int_(paddedH), ker_gpu, np.int_(S), result_gpu, np.int_(W), np.int_(H), block = (blockW, blockH, 1), grid=(gridW, gridH))

host_output = np.empty_like(img_og)
cuda.memcpy_dtoh(host_output, result_gpu)

Image.fromarray(host_output).show()

And here is the image I'm using: https://i.sstatic.net/8YvA6.jpg

Passing arrays into PyCuda convolution kernel producing unexpected behavior

Answers (1)

Related Questions