Reputation: 2731
I wrote small CUDA code to understand global memory to shared memory transfer transactions. The code is as follows:
#include <iostream>
using namespace std;
// Round-trips data through shared memory: each thread stages one uchar4
// from global memory into dynamically sized shared memory, then writes it
// back out to global memory.
// Launch contract: one block whose blockDim.x matches the element count of
// both buffers, with blockDim.x * sizeof(uchar4) bytes of dynamic shared
// memory passed as the third launch parameter. There is no bounds guard,
// so the grid must cover exactly the allocated buffers.
// No __syncthreads() is needed: every thread reads back only the slot it
// wrote itself.
__global__ void readUChar4(uchar4* c, uchar4* o){
extern __shared__ uchar4 staged[];
const int lane = threadIdx.x;
staged[lane] = c[lane]; // global -> shared
o[lane] = staged[lane]; // shared -> global
}
// Host driver: fills a 512-byte device buffer from a host string, runs the
// shared-memory round-trip kernel, and prints the bytes copied back.
// Fixes over the original: checks the kernel launch with cudaGetLastError(),
// frees all device/host allocations on every path, checks malloc(), prints
// byte values numerically (streaming an unsigned char inserts the glyph,
// which is invisible for non-printable values), and returns non-zero on
// failure.
int main(){
// Payload: a run of 'a' characters long enough to fill 128 uchar4
// elements (512 bytes).
string a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
const size_t nElems = 128;
const size_t nBytes = nElems * sizeof(uchar4); // 512 bytes

uchar4* c = 0;
cudaError_t e1 = cudaMalloc((void**)&c, nBytes);
if(e1 != cudaSuccess){
cout << "Failed to allocate memory" << endl;
return 1;
}

uchar4* o = 0;
cudaError_t e11 = cudaMalloc((void**)&o, nBytes);
if(e11 != cudaSuccess){
cout << "Failed to allocate output memory" << endl;
cudaFree(c);
return 1;
}

cudaError_t e2 = cudaMemcpy(c, a.c_str(), nBytes, cudaMemcpyHostToDevice);
if(e2 != cudaSuccess){
cout << "Failed to copy" << endl;
cudaFree(o);
cudaFree(c);
return 1;
}

// One block of 128 threads; third launch argument is the dynamic
// shared-memory size the kernel's extern __shared__ array requires.
readUChar4<<<1, 128, nBytes>>>(c, o);
// A kernel launch returns no status directly; launch-configuration
// errors must be fetched explicitly.
cudaError_t eLaunch = cudaGetLastError();
if(eLaunch != cudaSuccess){
cout << "Kernel launch failed: " << cudaGetErrorString(eLaunch) << endl;
cudaFree(o);
cudaFree(c);
return 1;
}

uchar4* oFromGPU = (uchar4*)malloc(nBytes);
if(oFromGPU == 0){
cout << "Failed to allocate host memory" << endl;
cudaFree(o);
cudaFree(c);
return 1;
}

// Blocking copy: also synchronizes with the kernel and surfaces any
// asynchronous execution error.
cudaError_t e22 = cudaMemcpy(oFromGPU, o, nBytes, cudaMemcpyDeviceToHost);
if(e22 == cudaSuccess){
for(size_t i = 0; i < nElems; i++){
// Cast to int so the bytes print as numbers ('a' -> 97).
cout << (int)oFromGPU[i].x << " ";
cout << (int)oFromGPU[i].y << " ";
cout << (int)oFromGPU[i].z << " ";
cout << (int)oFromGPU[i].w << " " << endl;
}
}
else{
cout << "Failed to copy from GPU" << endl;
}

free(oFromGPU);
cudaFree(o);
cudaFree(c);
return 0;
}
This code simply copies data from device memory to shared memory and back to device memory. I have the following three questions:
`cudaMalloc` allocates memory; if the memory were allocated in a haphazard manner such that the data is scattered over memory, then the copy would take more than 4 memory transactions per warp. However, if `cudaMalloc` allocates memory in 128-byte chunks or allocates it contiguously, then it should not take more than 4 memory transactions.
Upvotes: 0
Views: 587
Reputation: 27809
In the code you provided (repeated here) the compiler will completely remove the shared memory store and load since they don't do anything necessary or beneficial for the code.
__global__ void readUChar4(uchar4* c, uchar4* o){
// Dynamically sized shared-memory staging buffer; its byte size comes
// from the third kernel-launch parameter.
extern __shared__ uchar4 gc[];
int tid = threadIdx.x;
// Round-trip through shared memory. Nothing else ever reads gc, so this
// store/load pair is dead and the compiler is free to eliminate it --
// which is the point the surrounding answer makes.
gc[tid] = c[tid];
o[tid] = gc[tid];
}
Assuming you did something with the shared memory so it was not eliminated, then:
Each thread loads one `uchar4` (4 × 8 bits = 4 bytes), for a total of 128 bytes per warp. `cudaMalloc` allocates memory contiguously.
Upvotes: 3