Reputation: 3
I observe cudaFree operation takes unacceptably long time to operate it. To verify it, I wrote a simple micro benchmark code to test cudaFree overhead. It shows the similar trend.
first step: cudaMallocManaged
It doesn't take that long time. Actually it is less than 1ms just only for cudaMallocManaged.
second step: init
To initialize malloced memory region, init function is called. And it also doesn't take less than 1ms.
third step: cudaFree
Now, the overhead appears. It takes a lot of time. One more things is bigger gets the memory size, Longer it takes. (beautifully linearly)
Question is "Why does cudaFree have such a huge overhead? Does it fill whole memory region with zero for security issue? Or any other critical path it walks through?"
Here is the code and the measured result. Thank you in advance :) !!!
10 int getMilliCount(){
11 timeb tb;
12 ftime(&tb);
13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
14 return nCount;
15 }
16
17 int getTimeDiff(int baseTime){
18 int diff = getMilliCount() - baseTime;
19 return diff;
20 }
21
22 __global__ void init(int* x, size_t bytes_){
23 int num_ = bytes_/sizeof(int);
24 for (int i=0; i<num_; i++){
25 x[i] = i;
26 }
27 }
28
29 int main(){
30 printf("sizeof(size_t): %zu\n", sizeof(size_t));
31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int));
32 printf("sizeof(int): %zu\n", sizeof(int));
33 printf("sizeof(long): %zu\n", sizeof(long));
34
35 std::ofstream myfile;
36 myfile.open("output3.csv");
37 myfile<<"operation, num_bytes, start, end, duration\n";
38 int baseTime = getMilliCount();
39 int* dptr;
40 int ts1 = 0;
41 int ts2 = 0;
42 size_t KB = 1024; // start from 1KB
43 int num_trial_ = 1;
44 for (int j=10; j<25; j++){
45 size_t num_bytes_ = KB<<j;
46 for (int i=0; i<num_trial_; i++){
47 // measuring cudaMallocManaged
48 ts1 = getTimeDiff(baseTime);
>> 49 cudaMallocManaged((void**)&dptr, num_bytes_);
50 ts2 = getTimeDiff(baseTime);
51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
54
55 // measuring initialization
56 ts1 = getTimeDiff(baseTime);
>> 57 init<<<1,1>>>(dptr, num_bytes_);
58 ts2 = getTimeDiff(baseTime);
59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
62
63 // measuring cudaFree
64 ts1 = getTimeDiff(baseTime);
>> 65 cudaFree(dptr);
66 ts2 = getTimeDiff(baseTime);
67 myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
70 sleep(1);
71 printf("\n");
72 }
73 }
74 myfile.close();
75 return 1;
76 }
Results
cudaMallocManaged, memory_size:1MB, duration:360
init, memory_size:1MB, duration:0
cudaFree, memory_size:1MB, **duration:2**
cudaMallocManaged, memory_size:2MB, duration:1
init, memory_size:2MB, duration:0
cudaFree, memory_size:2MB, **duration:4**
cudaMallocManaged, memory_size:4MB, duration:0
init, memory_size:4MB, duration:0
cudaFree, memory_size:4MB, **duration:9**
cudaMallocManaged, memory_size:8MB, duration:0
init, memory_size:8MB, duration:0
cudaFree, memory_size:8MB, **duration:18**
cudaMallocManaged, memory_size:16MB, duration:0
init, memory_size:16MB, duration:0
cudaFree, memory_size:16MB, **duration:34**
cudaMallocManaged, memory_size:32MB, duration:0
init, memory_size:32MB, duration:0
cudaFree, memory_size:32MB, **duration:69**
cudaMallocManaged, memory_size:64MB, duration:0
init, memory_size:64MB, duration:0
cudaFree, memory_size:64MB, **duration:132**
cudaMallocManaged, memory_size:128MB, duration:0
init, memory_size:128MB, duration:0
cudaFree, memory_size:128MB, **duration:241**
cudaMallocManaged, memory_size:256MB, duration:0
init, memory_size:256MB, duration:0
cudaFree, memory_size:256MB, **duration:476**
cudaMallocManaged, memory_size:512MB, duration:0
init, memory_size:512MB, duration:0
cudaFree, memory_size:512MB, **duration:984**
cudaMallocManaged, memory_size:1024MB, duration:0
init, memory_size:1024MB, duration:0
cudaFree, memory_size:1024MB, **duration:1910**
cudaMallocManaged, memory_size:2048MB, duration:0
init, memory_size:2048MB, duration:1
cudaFree, memory_size:2048MB, **duration:3830**
cudaMallocManaged, memory_size:4096MB, duration:0
init, memory_size:4096MB, duration:0
cudaFree, memory_size:4096MB, **duration:7715**
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, **duration:0**
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, **duration:0**
Please enlighten me
Upvotes: 0
Views: 644
Reputation: 72372
Why does cudaFree have such a huge overhead?
It doesn't. The timing method you are using is incorrect and the time you attributed to cudaFree
comes from prior asynchronous operations
Does it fill whole memory region with zero for security issue?
No
Or any other critical path it walks through?
No, except it requires the device to be idle, which it is not in your case.
Let's fix the most obvious problem with you code:
// measuring initialization
ts1 = getTimeDiff(baseTime);
init<<<1,1>>>(dptr, num_bytes_);
cudaDeviceSynchronize(); // Wait until the GPU is idle
ts2 = getTimeDiff(baseTime);
And then run your experiment:
$ nvcc -std=c++11 -o fliestime fliestime.cu
$ ./fliestime
sizeof(size_t): 8
sizeof(unsigned int): 4
sizeof(int): 4
sizeof(long): 8
cudaMallocManaged, memory_size:1MB, duration:102
init, memory_size:1MB, duration:2
cudaFree, memory_size:1MB, duration:1
cudaMallocManaged, memory_size:2MB, duration:0
init, memory_size:2MB, duration:5
cudaFree, memory_size:2MB, duration:0
cudaMallocManaged, memory_size:4MB, duration:1
init, memory_size:4MB, duration:8
cudaFree, memory_size:4MB, duration:0
cudaMallocManaged, memory_size:8MB, duration:1
init, memory_size:8MB, duration:17
cudaFree, memory_size:8MB, duration:1
cudaMallocManaged, memory_size:16MB, duration:1
init, memory_size:16MB, duration:33
cudaFree, memory_size:16MB, duration:1
cudaMallocManaged, memory_size:32MB, duration:3
init, memory_size:32MB, duration:65
cudaFree, memory_size:32MB, duration:2
cudaMallocManaged, memory_size:64MB, duration:5
init, memory_size:64MB, duration:121
cudaFree, memory_size:64MB, duration:4
cudaMallocManaged, memory_size:128MB, duration:9
init, memory_size:128MB, duration:219
cudaFree, memory_size:128MB, duration:8
cudaMallocManaged, memory_size:256MB, duration:17
init, memory_size:256MB, duration:427
cudaFree, memory_size:256MB, duration:18
cudaMallocManaged, memory_size:512MB, duration:34
init, memory_size:512MB, duration:854
cudaFree, memory_size:512MB, duration:35
cudaMallocManaged, memory_size:1024MB, duration:67
init, memory_size:1024MB, duration:1709
cudaFree, memory_size:1024MB, duration:70
cudaMallocManaged, memory_size:2048MB, duration:133
init, memory_size:2048MB, duration:3418
cudaFree, memory_size:2048MB, duration:141
cudaMallocManaged, memory_size:4096MB, duration:786
init, memory_size:4096MB, duration:4
cudaFree, memory_size:4096MB, duration:0
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, duration:0
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, duration:0
You can see that the time you have assumed is cudaFree
is the time your completely serial init
kernel takes to run. Both cudaMallocManaged
and cudaFree
do use more time as the allocations increase in size, but that isn't unreasonable IMHO.
weird thing is that 8192MB and 16384MB shows less than 0ms not only for cudaManagedMalloc and init but also for cudaFree....
That is because nothing is running. If you employ correct runtime error checking, you will see that everything is failing with out of memory runtime errors.
Upvotes: 2