Reputation: 55
Let's say we keep a 100MB global memory buffer for a CUDA operation alive. In the next calculation the result is larger than 100MB, e.g. 120MB, and we know this number before the real result comes out. In this case we need to grow this buffer to 120MB. The question is: can we just allocate a new 20MB block and concatenate it to the existing 100MB? That way we could avoid the latency of freeing a 100MB block and allocating a 120MB one.
Upvotes: 0
Views: 2311
Reputation: 2588
In addition to Robert Crovella's answer, I would like to show that memory pool allocations via cudaMallocAsync can reduce the latency.
The following code grows an allocation 50 times by 20MB, by simply freeing the buffer and allocating a larger one. The next buffer size is computed by a kernel, to emulate an operation whose result size is only known after computation.
// compiled with nvcc -O3 main.cu -o main
#include <iostream>
#include <chrono>
#include <cstdint>

// Toggle: 1 selects stream-ordered pool allocation (cudaMallocAsync /
// cudaFreeAsync), 0 selects plain cudaMalloc / cudaFree.
#if 0
cudaError_t allocate(void** ptr, size_t size, cudaStream_t stream){
    return cudaMallocAsync(ptr, size, stream);
}
cudaError_t deallocate(void* ptr, cudaStream_t stream){
    return cudaFreeAsync(ptr, stream);
}
#else
cudaError_t allocate(void** ptr, size_t size, cudaStream_t){
    return cudaMalloc(ptr, size);
}
cudaError_t deallocate(void* ptr, cudaStream_t){
    return cudaFree(ptr);
}
#endif

// Emulates an operation whose output size is only known on the device.
__global__
void computeNextSize(size_t* size, size_t growBy){
    *size = *size + growBy;
}

int main(){
    cudaSetDevice(0);
    cudaStream_t stream = cudaStreamPerThread;

    const size_t mb20 = 20 * 1024 * 1024;
    size_t* h_size = nullptr;
    size_t* d_size = nullptr;
    cudaMalloc(&d_size, sizeof(size_t));
    cudaMallocHost(&h_size, sizeof(size_t));
    *h_size = mb20;
    cudaMemcpyAsync(d_size, h_size, sizeof(size_t), cudaMemcpyHostToDevice, stream);

    // Tell the device's default pool to keep freed memory cached instead of
    // returning it to the OS, so cudaMallocAsync can reuse it cheaply.
    cudaMemPool_t memPool;
    cudaDeviceGetMemPool(&memPool, 0);
    size_t setVal = UINT64_MAX;
    cudaMemPoolSetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &setVal);

    void* ptr = nullptr;
    size_t size = mb20;
    allocate(&ptr, size, stream);
    //std::cout << "size: " << size << ", ptr = " << ptr << "\n";
    cudaMemsetAsync(ptr, 0, size, stream); //work with ptr
    computeNextSize<<<1,1,0,stream>>>(d_size, mb20);
    cudaMemcpyAsync(h_size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream); //wait for computation and transfer of next size

    auto a = std::chrono::system_clock::now();
    for(int i = 0; i < 50; i++){
        deallocate(ptr, stream);
        size = *h_size;
        allocate(&ptr, size, stream);
        //std::cout << "size: " << size << ", ptr = " << ptr << "\n";
        cudaMemsetAsync(ptr, 0, size, stream); //work with ptr
        computeNextSize<<<1,1,0,stream>>>(d_size, mb20);
        cudaMemcpyAsync(h_size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream); //wait for computation and transfer of next size
    }
    deallocate(ptr, stream);
    cudaStreamSynchronize(stream);
    auto b = std::chrono::system_clock::now();
    const std::chrono::duration<double> delta = b - a;
    std::cerr << delta.count() << "s\n";
}
With cudaMalloc the code takes around 0.596s; with cudaMallocAsync the duration is around 0.092s.
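If you want a pool that is not tied to the device's default one, an explicit pool can be created and used the same way. The following is a minimal sketch of that variant, assuming CUDA 11.2 or newer; the 100MB size and device 0 are illustrative, not taken from the benchmark above.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main(){
    cudaSetDevice(0);
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Describe a device-local pool on device 0.
    cudaMemPoolProps props = {};
    props.allocType = cudaMemAllocationTypePinned;
    props.location.type = cudaMemLocationTypeDevice;
    props.location.id = 0;
    cudaMemPool_t pool;
    cudaMemPoolCreate(&pool, &props);

    // Keep freed blocks cached in the pool rather than returning them to the OS.
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    void* ptr = nullptr;
    cudaMallocFromPoolAsync(&ptr, 100ULL * 1024 * 1024, pool, stream);
    // ... launch work on ptr in this stream ...
    cudaFreeAsync(ptr, stream); // block goes back to the pool, ready for reuse
    cudaStreamSynchronize(stream);

    cudaMemPoolDestroy(pool);
    cudaStreamDestroy(stream);
    printf("done\n");
    return 0;
}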
Upvotes: 1
Reputation: 151799
The question is, can we just create a new memory of 20MB and concatenate it to the existing 100MB?
You can't do this with cudaMalloc, cudaMallocManaged, or cudaHostAlloc. The reason is that there is no way to ensure that the 20MB allocation/pointer will be contiguous with the previous 100MB allocation, and there is no way to request or force that using those allocation APIs. And any "buffer-like" usage of the 120MB space is going to expect that the space is contiguous.
As indicated in the comments, the Virtual Memory Management API is intended to allow for roughly this.
You could of course write your own allocation system that starts by allocating a large contiguous space, and then gives you the controls to do what you are suggesting - basically ask for the "next" 20MB. How to do that in detail is beyond the scope of my answer.
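As a sketch of that idea (the class name and sizes are mine, purely illustrative): a trivial bump suballocator that hands out consecutive pieces of one large upfront cudaMalloc, so each "grow" is contiguous with the previous piece.
#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

class BumpArena {
    char*  base_ = nullptr;
    size_t capacity_ = 0;
    size_t offset_ = 0;
public:
    explicit BumpArena(size_t capacity) : capacity_(capacity) {
        cudaError_t err = cudaMalloc(&base_, capacity_);
        assert(err == cudaSuccess);
    }
    ~BumpArena() { cudaFree(base_); }
    // Hands out the next `size` bytes, contiguous with the previous request.
    void* grow(size_t size) {
        assert(offset_ + size <= capacity_);
        void* p = base_ + offset_;
        offset_ += size;
        return p;
    }
    void reset() { offset_ = 0; } // reuse the arena for the next computation
};

int main(){
    BumpArena arena(512ULL * 1024 * 1024);        // one upfront 512MB allocation
    void* buf = arena.grow(100ULL * 1024 * 1024); // first 100MB
    arena.grow(20ULL * 1024 * 1024);              // "concatenate" 20MB more
    // buf now fronts a contiguous 120MB region; no free/malloc in between.
    (void)buf;
    return 0;
}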
I believe another possible suggestion is: simply allocate, once, up front, for the largest size you will ever need, and reuse that buffer.
Yes, I understand the usual objection: "but I don't know what the largest size may be". That is understood. The above mechanism accounts for that. But if you really cannot provide any sort of "upper bound" based on your knowledge of what you are doing, then there is no guarantee that you will have enough GPU memory anyway when you discover that more space is needed.
The basic steps to follow using the virtual memory management method are as follows:
1. Create a physical device allocation with cuMemCreate, say of 100MB.
2. Reserve a virtual address range with cuMemAddressReserve, large enough for the total you anticipate (here, 120MB).
3. Map the physical allocation into the reserved range with cuMemMap, and make it accessible using cuMemSetAccess.
4. When more space is needed, create another physical allocation with cuMemCreate, say of 20MB.
5. Map it into the reserved range immediately after the first chunk with cuMemMap, and make it accessible using cuMemSetAccess.
Here is an example following that sequence:
$ cat t2075.cu
#include <cuda.h>
#include <iostream>
#include <cassert>
__global__ void fill(unsigned char *d, size_t size, unsigned char val){
    for (size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < size; i += blockDim.x*gridDim.x)
        d[i] = val;
}

size_t ROUND_UP(size_t size, size_t granularity){
    size_t mult = ceil(size/(double)granularity);
    return mult*granularity;
}
CUmemGenericAllocationHandle allocatePhysicalMemory(int device, size_t size) {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device;
    size_t granularity;
    CUresult result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    assert(result == CUDA_SUCCESS);

    // Ensure size matches granularity requirements for the allocation
    size_t padded_size = ROUND_UP(size, granularity);

    // Allocate physical memory
    CUmemGenericAllocationHandle allocHandle;
    result = cuMemCreate(&allocHandle, padded_size, &prop, 0);
    assert(result == CUDA_SUCCESS);
    return allocHandle;
}
void setAccessOnDevice(int device, CUdeviceptr ptr, size_t size) {
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = device;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    // Make the address range accessible from the given device
    CUresult result = cuMemSetAccess(ptr, size, &accessDesc, 1);
    assert(result == CUDA_SUCCESS);
}
const size_t first_chunk_size = 100ULL*1048576;
const size_t second_chunk_size = 20ULL*1048576;
const size_t total_size = first_chunk_size+second_chunk_size;
int main(){
    // test for support
    CUdevice device;
    int device_id = 0;
    cuInit(0);
    CUresult result = cuDeviceGet(&device, device_id);
    assert(result == CUDA_SUCCESS);
    int deviceSupportsVmm = 0;
    result = cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device);
    assert(result == CUDA_SUCCESS);
    if (deviceSupportsVmm == 0) {
        std::cout << "Not supported!" << std::endl;
        return 0;
    }
    // `device` supports Virtual Memory Management

    // allocate first chunk
    CUmemGenericAllocationHandle h1 = allocatePhysicalMemory(device_id, first_chunk_size);
    CUdeviceptr ptr;
    // `ptr` holds the returned start of the reserved virtual address range.
    result = cuMemAddressReserve(&ptr, total_size, 0, 0, 0); // alignment = 0 for default alignment
    assert(result == CUDA_SUCCESS);
    // `ptr`: address in the range previously reserved by cuMemAddressReserve.
    // `h1`: CUmemGenericAllocationHandle obtained by a previous call to cuMemCreate.
    result = cuMemMap(ptr, first_chunk_size, 0, h1, 0);
    assert(result == CUDA_SUCCESS);
    // enable access to first chunk
    setAccessOnDevice(device_id, ptr, first_chunk_size);
    unsigned char *d = (unsigned char *)ptr;
    // test access to first chunk
    fill<<<80, 1024>>>(d, first_chunk_size, 0);
    cudaError_t err = cudaGetLastError();
    assert(err == cudaSuccess);
    err = cudaDeviceSynchronize();
    assert(err == cudaSuccess);

    // allocate second chunk
    CUmemGenericAllocationHandle h2 = allocatePhysicalMemory(device_id, second_chunk_size);
    // map second chunk immediately after the first, within the reserved range
    result = cuMemMap(ptr+first_chunk_size, second_chunk_size, 0, h2, 0);
    assert(result == CUDA_SUCCESS);
    // enable access to second chunk
    setAccessOnDevice(device_id, ptr+first_chunk_size, second_chunk_size);
    // test access to both chunks
    fill<<<80, 1024>>>(d, total_size, 0);
    err = cudaGetLastError();
    assert(err == cudaSuccess);
    err = cudaDeviceSynchronize();
    assert(err == cudaSuccess);
}
$ nvcc -o t2075 t2075.cu -lcuda
$ compute-sanitizer ./t2075
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$
CUDA 11.4, V100, CentOS 7
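One step the example omits is teardown. Here is a hedged sketch of the cleanup sequence, using the driver-API calls that pair with the ones above; the helper function is mine, not part of the example, and error checks are omitted for brevity.
#include <cuda.h>
#include <cstddef>

// Hypothetical helper: releases what the example above created.
// Unmap first, then release the physical handles, then free the
// virtual address reservation.
void teardown(CUdeviceptr ptr, size_t total_size,
              CUmemGenericAllocationHandle h1,
              CUmemGenericAllocationHandle h2){
    cuMemUnmap(ptr, total_size);       // unmap both mapped chunks
    cuMemRelease(h1);                  // release the 100MB physical allocation
    cuMemRelease(h2);                  // release the 20MB physical allocation
    cuMemAddressFree(ptr, total_size); // return the reserved address range
}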
A few observations:
Upvotes: 2