Reputation: 1
I'm using Pycuda to perform a matrix multiplication, but it's not working.
I used the cuda code in a smaller problem and it worked fine so I have a feeling that the problem is in the code written in Python
kernel_code
__global__ void forward_propagation(float *a, float *w, float *b, float *z, int a_rows, int a_cols, int w_cols) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < a_rows && col < w_cols) {
float value = 0.0;
for (int i = 0; i < a_cols; i++) {
value += a[row * a_cols + i] * w[i * w_cols + col];
}
z[row * w_cols + col] = value + b[col];
}
}
python code
mod = compiler.SourceModule(kernel_code)
forward_propagation_gpu = mod.get_function("forward_propagation")
def forward_propagation(X, parameters, activation):
forward_cache = {}
L = len(parameters) // 2
forward_cache['A0'] = X
print(forward_cache['A0'].shape)
for l in range(1, L):
print(l)
A_prev = forward_cache['A' + str(l-1)]
W = parameters['W' + str(l)]
b = parameters['b' + str(l)]
# Dimensions
a_rows, a_cols = A_prev.shape
w_cols = W.shape[1]
# Allocate GPU memory
# Allocate memory on device
a_gpu = cuda.mem_alloc(A_prev.nbytes)
w_gpu = cuda.mem_alloc(W.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
z_gpu = cuda.mem_alloc(a_rows * w_cols * np.float32().nbytes)
# Copy matrices to device memory
cuda.memcpy_htod(a_gpu, A_prev)
cuda.memcpy_htod(w_gpu, W)
cuda.memcpy_htod(b_gpu, b)
# Define block and grid sizes
block_dim = (16, 16, 1) # 16 x 16
grid_dim = (int(np.ceil(w_cols / block_dim[0])), int(np.ceil(a_rows / block_dim[1])), 1)
# Run the CUDA kernel
forward_propagation_gpu(a_gpu, w_gpu, b_gpu, z_gpu, np.int32(a_rows), np.int32(a_cols), np.int32(w_cols), block=block_dim, grid=grid_dim)
cuda.Context.synchronize()
# Retrieve the result from the GPU
Z = np.empty((a_rows, w_cols), dtype=np.float32)
cuda.memcpy_dtoh(Z, z_gpu)
forward_cache['Z' + str(l)] = Z
if activation == 'tanh':
forward_cache['A' + str(l)] = tanh(forward_cache['Z' + str(l)])
else:
forward_cache['A' + str(l)] = relu(forward_cache['Z' + str(l)])
# forward_cache['Z' + str(L)] = parameters['W' + str(L)].dot(forward_cache['A' + str(L-1)]) + parameters['b' + str(L)]
if forward_cache['Z' + str(L)].shape[0] == 1:
forward_cache['A' + str(L)] = sigmoid(forward_cache['Z' + str(L)])
else :
forward_cache['A' + str(L)] = softmax(forward_cache['Z' + str(L)])
return forward_cache['A' + str(L)], forward_cache
if __name__ == '__main__':
X_train = sys.argv[1]
Y_train = sys.argv[2]
X_train = np.loadtxt(X_train, delimiter = ',').astype(np.float32) / 255.0
Y_train = np.loadtxt(Y_train, delimiter = ',').astype(np.float32).reshape(1, -1)
Y_train = Y_train.astype(np.float32)
layer_dim = [X_train.shape[0]]
for i in range(3, len(sys.argv)):
layer_dim.append(int(sys.argv[i]))
# X_test = np.loadtxt('dataset/cat_test_x.csv', delimiter = ',')/255.0
# Y_test = np.loadtxt('dataset/cat_test_y.csv', delimiter = ',').reshape(1, X_test.shape[1])
lr = 0.0075
iters = 1000
parameters = model(X_train, Y_train, layer_dim, lr, activation = 'relu', num_iteration = iters)
The error:
PS C:\workspace\TI6> python dnn.py X_train.csv Y_train.csv 30 30 30 1
...
File "C:\\workspace\\TI6\\dnn.py", line 165, in forward_propagation
cuda.Context.synchronize()
pycuda.\_driver.LogicError: cuCtxSynchronize failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
I tried changing "mem_alloc" to "gpuarray.to_gpu". didn't work.
I tried changing the python code to simpler one and it worked.
I tried changing X_train to others types and it didn't work.
I tried copying other approach in stackoverflow, but I really think that the problem is in Python code
Upvotes: 0
Views: 25