Legopart -

Reputation: 11

How do I send buffers between two GPUs?

I am trying to allocate buffers A and B on GPU 0, allocate matching buffers A and B (plus an output buffer D) on GPU 1, copy the data across, and then run a kernel on GPU 1 that adds A + B and writes the result into D.

My issue is that I cannot get this to work in Python with either PyCuda or CuPy.

I am searching for working examples of this (PyCuda or CuPy).

I tried this code in PyCuda, and it runs, but I can't prove that it actually allocates the buffers on GPU 1 (see the verification sketch after the listing).

import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import time

# Initialize the driver explicitly; pycuda.autoinit is omitted because it
# would create its own context on GPU 0 and sit on the context stack
# underneath the manually managed contexts below.
cuda.init()

array_size = 1000000
h_a = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_b = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_c = np.random.randint(0, 101, size=array_size).astype(np.float32)

dev0 = cuda.Device(0)
ctx0 = dev0.make_context()  # make_context() also pushes the new context
d_a_gpu0 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu0 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu0 = cuda.mem_alloc(h_c.nbytes)
cuda.memcpy_htod(d_a_gpu0, h_a)
cuda.memcpy_htod(d_b_gpu0, h_b)
cuda.memcpy_htod(d_c_gpu0, h_c)
ctx0.pop()

dev1 = cuda.Device(1)
ctx1 = dev1.make_context()
d_a_gpu1 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu1 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu1 = cuda.mem_alloc(h_c.nbytes)
ctx1.pop()

# memcpy_peer copies between allocations in the two contexts; if P2P is not
# available it falls back to staging the copy through the host.
ctx0.push()
start_peer_transfer = time.time()
cuda.memcpy_peer(d_a_gpu1, dest_context=ctx1, src=d_a_gpu0, src_context=ctx0, size=h_a.nbytes)
cuda.memcpy_peer(d_b_gpu1, dest_context=ctx1, src=d_b_gpu0, src_context=ctx0, size=h_b.nbytes)
end_peer_transfer = time.time()
ctx0.pop()

kernel_code = """
__global__ void add_arrays(float *a, float *b, float *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1000000)
    {
        c[idx] = a[idx] + b[idx];
    }
}
"""

block_size = 256
grid_size = (array_size + block_size - 1) // block_size
n = np.int32(array_size)

# A compiled module belongs to the context that was current when it was
# built, so compile the kernel once per GPU context before launching.
ctx0.push()
add_arrays_gpu0 = SourceModule(kernel_code).get_function("add_arrays")
start_gpu0_kernel = time.time()
add_arrays_gpu0(d_a_gpu0, d_b_gpu0, d_c_gpu0, n, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu0_kernel = time.time()
ctx0.pop()

ctx1.push()
add_arrays_gpu1 = SourceModule(kernel_code).get_function("add_arrays")
start_gpu1_kernel = time.time()
add_arrays_gpu1(d_a_gpu1, d_b_gpu1, d_c_gpu1, n, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu1_kernel = time.time()
ctx1.pop()

h_c_result = np.empty_like(h_c)
h_d_result = np.empty_like(h_c)
ctx0.push()
cuda.memcpy_dtoh(h_c_result, d_c_gpu0)
ctx0.pop()

ctx1.push()
cuda.memcpy_dtoh(h_d_result, d_c_gpu1)
ctx1.pop()

comparison = np.allclose(h_c_result, h_d_result)
print("Results are identical:", comparison)

print(f"Time for cuda.memcpy_peer: {end_peer_transfer - start_peer_transfer:.6f} seconds")
print(f"Time for kernel execution on GPU 0: {end_gpu0_kernel - start_gpu0_kernel:.6f} seconds")
print(f"Time for kernel execution on GPU 1: {end_gpu1_kernel - start_gpu1_kernel:.6f} seconds")

# Free each buffer with its owning context current, then release the contexts.
ctx0.push()
d_a_gpu0.free()
d_b_gpu0.free()
d_c_gpu0.free()
ctx0.pop()

ctx1.push()
d_a_gpu1.free()
d_b_gpu1.free()
d_c_gpu1.free()
ctx1.pop()

ctx1.detach()
ctx0.detach()
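
To address the "can't prove it" part: a minimal verification sketch, assuming pycuda.driver.mem_get_info (which reports free/total memory for the device of the current context). If the allocation really lands on GPU 1, that GPU's free memory should drop by at least the requested size:

import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(1).make_context()  # allocations below go to GPU 1
try:
    free_before, total = cuda.mem_get_info()
    buf = cuda.mem_alloc(4 * 1000000)  # one million float32s, as above
    free_after, _ = cuda.mem_get_info()
    # The driver may round allocations up, so expect "at least", not "exactly".
    print("GPU 1 free memory dropped by", free_before - free_after, "bytes")
    buf.free()
finally:
    ctx.pop()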

For CuPy I tried the following code, which returns an error on "deviceCanAccessPeer":

import cupy as cp
import numpy as np
import time

try:
    with cp.cuda.Device(0):
        A = cp.random.rand(1000)
        B = cp.random.rand(1000)

        start_time = time.time()

        C = A + B   # computed on device 0

        C_cpu = cp.asnumpy(C)

        print("Device 0 computation time:", time.time() - start_time)

    if cp.cuda.runtime.getDeviceCount() > 1:
        # deviceCanAccessPeer(dev, peer) returns 1 when `dev` can map `peer`'s
        # memory; use the result, and enable access with the accessing device
        # (device 1) current.
        if cp.cuda.runtime.deviceCanAccessPeer(1, 0):
            with cp.cuda.Device(1):
                cp.cuda.runtime.deviceEnablePeerAccess(0)

    with cp.cuda.Device(1):
        start_time = time.time()

        # empty_like allocates on the current device, so these buffers
        # live on device 1.
        A_device1 = cp.empty_like(A)
        B_device1 = cp.empty_like(B)

        # memcpyPeer(dst, dstDevice, src, srcDevice, nbytes)
        cp.cuda.runtime.memcpyPeer(A_device1.data.ptr, 1, A.data.ptr, 0, A.nbytes)
        cp.cuda.runtime.memcpyPeer(B_device1.data.ptr, 1, B.data.ptr, 0, B.nbytes)

        D = A_device1 + B_device1

        D_cpu = cp.asnumpy(D)

        print("Device 1 computation time:", time.time() - start_time)

    comparison = np.array_equal(C_cpu, D_cpu)
    print("Comparison result:", comparison)

except cp.cuda.runtime.CUDARuntimeError as e:
    print(f"CUDA Runtime Error: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

Upvotes: 1

Views: 79

Answers (0)
