Save Time needed for cudaHostAlloc

Question

I'm trying to find out if it makes sense to copy data to pinned memory before transferring it to the device since I have no influence on the allocation of my input data (it's a lib).

std::vector idata(WORK_SIZE);
int *idata_aligned = NULL;
int *d1 = NULL;
int *d2 = NULL;

for (int i = 0; i < WORK_SIZE; ++i)
    idata[i] = i;
CUDA_CHECK_RETURN(cudaMalloc((void**) &d1, sizeof(int) * WORK_SIZE));
CUDA_CHECK_RETURN(cudaMalloc((void**) &d2, sizeof(int) * WORK_SIZE));

printf("unpinned:
");
{
    boost::timer::auto_cpu_timer t;
    CUDA_CHECK_RETURN(cudaMemcpy(d1, &idata[0], sizeof(int) * WORK_SIZE, cudaMemcpyHostToDevice));
}

printf("copy to pinned:
");
{
    boost::timer::auto_cpu_timer t;
    CUDA_CHECK_RETURN(cudaHostAlloc((void**) &idata_aligned, sizeof(int) * WORK_SIZE,cudaHostAllocDefault));
    memcpy(idata_aligned, &idata[0], sizeof(int) * WORK_SIZE);
    CUDA_CHECK_RETURN(cudaMemcpy(d2, idata_aligned, sizeof(int) * WORK_SIZE, cudaMemcpyHostToDevice));
}

Output for 10,000,000 elements:

unpinned:
 0.018919s wall, 0.020000s user + 0.000000s system = 0.020000s CPU (105.7%)
copy to pinned:
 0.045428s wall, 0.020000s user + 0.020000s system = 0.040000s CPU (88.1%)

The main problem seems to be the cudaHostAlloc (even without memcpy the second approach is much slower).

Did I do something wrong? Is there another way to use pinned memory for already allocated memory?

Save Time needed for cudaHostAlloc

Answers (1)

Related Questions