kamei

Reputation: 1

Matrix multiplication using PyCUDA

I'm using PyCUDA to perform a matrix multiplication, but it's not working.
The CUDA kernel itself worked fine on a smaller problem (a standalone test along the lines of the sketch shown after the kernel below), so I have a feeling the problem is in the Python code.

kernel_code

// Computes z = a * w + b in row-major layout:
// a is (a_rows x a_cols), w is (a_cols x w_cols), z is (a_rows x w_cols),
// and b holds one bias per output column.
__global__ void forward_propagation(float *a, float *w, float *b, float *z, int a_rows, int a_cols, int w_cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < a_rows && col < w_cols) {
        float value = 0.0f;
        for (int i = 0; i < a_cols; i++) {
            value += a[row * a_cols + i] * w[i * w_cols + col];
        }
        z[row * w_cols + col] = value + b[col];
    }
}
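
For reference, this is roughly the kind of standalone test that worked for me (a sketch, not the exact script: it assumes kernel_code holds the source above, and the sizes and random data are made up):

import numpy as np
import pycuda.autoinit          # initializes the driver and creates a context
import pycuda.driver as cuda
from pycuda import compiler

mod = compiler.SourceModule(kernel_code)
forward_propagation_gpu = mod.get_function("forward_propagation")

a_rows, a_cols, w_cols = 4, 8, 5                        # made-up small sizes
a = np.random.rand(a_rows, a_cols).astype(np.float32)
w = np.random.rand(a_cols, w_cols).astype(np.float32)
b = np.random.rand(w_cols).astype(np.float32)
z = np.empty((a_rows, w_cols), dtype=np.float32)

# cuda.In / cuda.Out handle the allocation and copies; a single
# 16x16 block covers these small matrices.
forward_propagation_gpu(cuda.In(a), cuda.In(w), cuda.In(b), cuda.Out(z),
                        np.int32(a_rows), np.int32(a_cols), np.int32(w_cols),
                        block=(16, 16, 1), grid=(1, 1, 1))

assert np.allclose(z, a @ w + b, atol=1e-4)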

Python code (imports included as the snippet uses them; pycuda.autoinit provides the context):

import sys
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import compiler

mod = compiler.SourceModule(kernel_code)
forward_propagation_gpu = mod.get_function("forward_propagation")

def forward_propagation(X, parameters, activation):
   
    forward_cache = {}
    L = len(parameters) // 2        
    
    forward_cache['A0'] = X
    print(forward_cache['A0'].shape)
    for l in range(1, L):
        print(l)
        A_prev = forward_cache['A' + str(l-1)]
        W = parameters['W' + str(l)]
        b = parameters['b' + str(l)]
        
        # Dimensions
        a_rows, a_cols = A_prev.shape
        w_cols = W.shape[1]
        
        # Allocate memory on the device
        a_gpu = cuda.mem_alloc(A_prev.nbytes)
        w_gpu = cuda.mem_alloc(W.nbytes)
        b_gpu = cuda.mem_alloc(b.nbytes)
        z_gpu = cuda.mem_alloc(a_rows * w_cols * np.float32().nbytes)

        # Copy matrices to device memory
        cuda.memcpy_htod(a_gpu, A_prev)
        cuda.memcpy_htod(w_gpu, W)
        cuda.memcpy_htod(b_gpu, b)
        
        # Define block and grid sizes
        block_dim = (16, 16, 1)  # 16 x 16
        grid_dim = (int(np.ceil(w_cols / block_dim[0])), int(np.ceil(a_rows / block_dim[1])), 1)
        
        # Run the CUDA kernel
        forward_propagation_gpu(a_gpu, w_gpu, b_gpu, z_gpu, np.int32(a_rows), np.int32(a_cols), np.int32(w_cols), block=block_dim, grid=grid_dim)
        cuda.Context.synchronize()
        # Retrieve the result from the GPU
        Z = np.empty((a_rows, w_cols), dtype=np.float32)
        cuda.memcpy_dtoh(Z, z_gpu)
        forward_cache['Z' + str(l)] = Z

        if activation == 'tanh':
            forward_cache['A' + str(l)] = tanh(forward_cache['Z' + str(l)])
        else:
            forward_cache['A' + str(l)] = relu(forward_cache['Z' + str(l)])
            

    # forward_cache['Z' + str(L)] = parameters['W' + str(L)].dot(forward_cache['A' + str(L-1)]) + parameters['b' + str(L)]
    
    if forward_cache['Z' + str(L)].shape[0] == 1:
        forward_cache['A' + str(L)] = sigmoid(forward_cache['Z' + str(L)])
    else:
        forward_cache['A' + str(L)] = softmax(forward_cache['Z' + str(L)])
    
    return forward_cache['A' + str(L)], forward_cache

if __name__ == '__main__':
    X_train = sys.argv[1]
    Y_train = sys.argv[2]
    X_train = np.loadtxt(X_train, delimiter = ',').astype(np.float32) / 255.0
    Y_train = np.loadtxt(Y_train, delimiter = ',').astype(np.float32).reshape(1, -1)
    layer_dim = [X_train.shape[0]]
    for i in range(3, len(sys.argv)):
        layer_dim.append(int(sys.argv[i]))
    # X_test = np.loadtxt('dataset/cat_test_x.csv', delimiter = ',')/255.0
    # Y_test = np.loadtxt('dataset/cat_test_y.csv', delimiter = ',').reshape(1, X_test.shape[1])
    lr = 0.0075
    iters = 1000
    parameters = model(X_train, Y_train, layer_dim, lr, activation = 'relu', num_iteration = iters)

The error:

PS C:\workspace\TI6> python dnn.py X_train.csv Y_train.csv 30 30 30 1
...
  File "C:\workspace\TI6\dnn.py", line 165, in forward_propagation
    cuda.Context.synchronize()
pycuda._driver.LogicError: cuCtxSynchronize failed: an illegal memory access was encountered

PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered

I tried changing "mem_alloc" to "gpuarray.to_gpu". didn't work.
I tried changing the python code to simpler one and it worked.
I tried changing X_train to others types and it didn't work.
I tried copying other approach in stackoverflow, but I really think that the problem is in Python code
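
For debugging, this is the kind of sanity check that could go right before the host-to-device copies (just a sketch; check_inputs is a made-up helper, not part of my code):

import numpy as np

# Hypothetical helper (not in the original code): verify the host arrays
# match what the kernel assumes before cuda.memcpy_htod runs.
def check_inputs(A_prev, W, b):
    for name, arr in (("A_prev", A_prev), ("W", W), ("b", b)):
        # memcpy_htod copies raw bytes, so dtype and layout must already
        # match the kernel's float32 row-major expectation.
        assert arr.dtype == np.float32, f"{name} has dtype {arr.dtype}"
        assert arr.flags["C_CONTIGUOUS"], f"{name} is not C-contiguous"
    # The kernel walks a_cols entries of each row of a and column of w.
    assert A_prev.shape[1] == W.shape[0], "A_prev.shape[1] must equal W.shape[0]"
    # The kernel reads b[col] for every col < w_cols.
    assert b.size >= W.shape[1], "b needs at least w_cols entries"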

Upvotes: 0

Views: 25

Answers (0)
