Reputation: 11
I trying to allocate buffer A and buffer B to GPU 0 and then allocate them (buffer A and buffer B) to GPU 1 and run them the a kernel that adds A + B and allocates it to buffer D (that I allocated before in GPU 1)
my issue is that it does not work in Python, PyCuda, or CuPy.
I am searching for some examples of it (PyCuda, or CuPy).
I tried this code in PyCuda, and that works, but I can't prove that it allocates the buffer on GPU1
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
cuda.init()
array_size = 1000000
h_a = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_b = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_c = np.random.randint(0, 101, size=array_size).astype(np.float32)
dev0 = cuda.Device(0)
ctx0 = dev0.make_context()
ctx0.push()
d_a_gpu0 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu0 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu0 = cuda.mem_alloc(h_c.nbytes)
cuda.memcpy_htod(d_a_gpu0, h_a)
cuda.memcpy_htod(d_b_gpu0, h_b)
cuda.memcpy_htod(d_c_gpu0, h_c)
ctx0.pop()
dev1 = cuda.Device(1)
ctx1 = dev1.make_context()
ctx1.push()
d_a_gpu1 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu1 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu1 = cuda.mem_alloc(h_c.nbytes)
ctx1.pop()
ctx0.push()
start_peer_transfer = time.time()
cuda.memcpy_peer(d_a_gpu1, dest_context=ctx1, src=d_a_gpu0, src_context=ctx0, size=h_a.nbytes)
cuda.memcpy_peer(d_b_gpu1, dest_context=ctx1, src=d_b_gpu0, src_context=ctx0, size=h_b.nbytes)
end_peer_transfer = time.time()
ctx0.pop()
kernel_code = """
__global__ void add_arrays(float *a, float *b, float *c)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < 1000000)
{
c[idx] = a[idx] + b[idx];
}
}
"""
mod = SourceModule(kernel_code)
add_arrays = mod.get_function("add_arrays")
block_size = 256
grid_size = (array_size + block_size - 1) // block_size
ctx0.push()
start_gpu0_kernel = time.time()
add_arrays(d_a_gpu0, d_b_gpu0, d_c_gpu0, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu0_kernel = time.time()
ctx0.pop()
ctx1.push()
start_gpu1_kernel = time.time()
add_arrays(d_a_gpu1, d_b_gpu1, d_c_gpu1, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu1_kernel = time.time()
ctx1.pop()
h_c_result = np.empty_like(h_c)
h_d_result = np.empty_like(h_c)
ctx0.push()
cuda.memcpy_dtoh(h_c_result, d_c_gpu0)
ctx0.pop()
ctx1.push()
cuda.memcpy_dtoh(h_d_result, d_c_gpu1)
ctx1.pop()
comparison = np.allclose(h_c_result, h_d_result)
print("Results are identical:", comparison)
print(f"Time for cuda.memcpy_peer: {end_peer_transfer - start_peer_transfer:.6f} seconds")
print(f"Time for kernel execution on GPU 0: {end_gpu0_kernel - start_gpu0_kernel:.6f} seconds")
print(f"Time for kernel execution on GPU 1: {end_gpu1_kernel - start_gpu1_kernel:.6f} seconds")
d_a_gpu0.free()
d_b_gpu0.free()
d_c_gpu0.free()
d_a_gpu1.free()
d_b_gpu1.free()
d_c_gpu1.free()
ctx1.pop()
ctx0.pop()
For CuPy I tried this code that return an error on "deviceCanAccessPeer"
import cupy as cp
import numpy as np
import time
try:
with cp.cuda.Device(0):
A = cp.random.rand(1000)
B = cp.random.rand(1000)
C = cp.empty_like(A)
start_time = time.time()
C = A + B
C_cpu = cp.asnumpy(C)
print("Device 0 computation time:", time.time() - start_time)
if cp.cuda.runtime.getDeviceCount() > 1:
cp.cuda.runtime.deviceCanAccessPeer(1, 0)
cp.cuda.runtime.deviceEnablePeerAccess(1)
with cp.cuda.Device(1):
D = cp.empty_like(A)
start_time = time.time()
A_device1 = cp.empty_like(A)
B_device1 = cp.empty_like(B)
cp.cuda.runtime.memcpyPeer(A_device1.data.ptr, 1, A.data.ptr, 0, A.nbytes)
cp.cuda.runtime.memcpyPeer(B_device1.data.ptr, 1, B.data.ptr, 0, B.nbytes)
D = A_device1 + B_device1
D_cpu = cp.asnumpy(D)
print("Device 1 computation time:", time.time() - start_time)
comparison = np.array_equal(C_cpu, D_cpu)
print("Comparison result:", comparison)
except cp.cuda.runtime.CUDARuntimeError as e:
print(f"CUDA Runtime Error: {e}")
except Exception as e:
print(f"An error occurred: {e}")
Upvotes: 1
Views: 79