Reputation: 737
I have a CUDA application in which I am trying to use constant memory. The data in constant memory is only recognized inside the kernel when the kernel is written in the same file as the main function. If I define the kernel function in a different file, the constant memory reads as 0, although the operation otherwise runs properly. Here is a simple dummy program that illustrates the problem. It has a 48x48 matrix divided into 16x16 blocks, and I store random numbers from 1 to 50 in it. Inside the kernel, I add the numbers stored in constant memory to each row of a block. The code is given below:
Header File:
#include <windows.h>
#include <dos.h>
#include <stdio.h>
#include <conio.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <curand.h>
#include <curand_kernel.h>
// Constant-memory table of 16 ints that the kernel adds to the matrix values.
// NOTE(review): this header *defines* the variable, so each .cu file that
// includes it gets its own definition. Unless the project is built with
// separate compilation and device linking, the host code that fills
// test_cons and the kernel that reads it must live in the same file.
__constant__ int test_cons[16];
// Kernel prototype: (input matrix, output matrix), both passed as device pointers.
__global__ void test_kernel_1(int *,int *);
Main Program :
// Aborts with a file/line diagnostic when a CUDA runtime call does not
// return cudaSuccess, so failures are not silently carried forward.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Demo driver: uploads 16 offsets to the test_cons constant array, fills a
 * 48x48 host matrix with random values, runs test_kernel_1 on it with a
 * 3x3 grid of 16x16 thread blocks, and prints the input and result matrices.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main(int argc,char *argv[])
{
    const int dim = 48;    // matrix is dim x dim; the kernel hard-codes a row width of 48
    const int tile = 16;   // thread-block edge; also the number of entries in test_cons
    const size_t bytes = (size_t)dim * dim * sizeof(int);
    int *mat = NULL, *res = NULL, *dev_mat = NULL, *dev_res = NULL;
    int i, j;
    int test[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};

    (void)argc;
    (void)argv;

    // Upload the offsets that the kernel reads from constant memory.
    CUDA_CHECK(cudaMemcpyToSymbol(test_cons, test, sizeof(test)));

    mat = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if (mat == NULL || res == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(mat);
        free(res);
        return EXIT_FAILURE;
    }
    memset(res, 0, bytes);

    srand((unsigned int)time(NULL));
    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            // rand() % 49 + 1 yields values in 1..49.
            mat[i * dim + j] = rand() % (50 - 1) + 1;
            printf("%d\t", mat[i * dim + j]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaMalloc((void **)&dev_mat, bytes));
    CUDA_CHECK(cudaMemcpy(dev_mat, mat, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void **)&dev_res, bytes));

    // Named grid/block rather than gridDim/blockDim to avoid shadowing the
    // CUDA built-in variable names.
    dim3 grid(dim / tile, dim / tile, 1);
    dim3 block(tile, tile, 1);
    test_kernel_1<<<grid, block>>>(dev_mat, dev_res);
    CUDA_CHECK(cudaGetLastError());   // reports launch-configuration errors

    // The blocking copy also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(res, dev_res, bytes, cudaMemcpyDeviceToHost));

    printf("\n\n\n\n");
    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            printf("%d\t", res[i * dim + j]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaFree(dev_mat));
    CUDA_CHECK(cudaFree(dev_res));
    free(mat);
    free(res);
    return 0;
}
Kernel Function :
/*
 * Adds an offset from the test_cons constant array to every matrix element.
 *
 * Launch with a 2D grid of 2D blocks; each thread handles one element of a
 * row-major matrix whose rows are 48 ints wide. The offset is selected by the
 * thread's x index within its block, so blockDim.x must not exceed the
 * 16 entries of test_cons. There is no bounds guard: the launch must not
 * cover more elements than dev_mat and dev_res hold.
 *
 * dev_mat: input matrix (device pointer).
 * dev_res: output matrix (device pointer).
 */
__global__ void test_kernel_1(int *dev_mat,int* dev_res)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int idx = col + 48 * row;              // flat row-major element index
    const int offset = test_cons[threadIdx.x];   // per-column offset within the block

    dev_res[idx] = offset + dev_mat[idx];
}
Now, when I define the kernel function in the same file as the main program, the constant memory values are correct; but if it is in a different file, the test_cons[threadIdx.x] values become 0.
I came across this link, which discusses much the same problem, but I do not fully understand it. It would be very helpful if someone could tell me why this is happening and what I need to do to avoid this problem. Any help would be highly appreciated. Thanks.
Upvotes: 0
Views: 220
Reputation: 1301
The above answer is completely acceptable; I am adding this one only because the user was not able to get it working. You can accept the above answer; this is just for your reference.
Kernel.cu file:
#include <stdio.h>
// Constant-memory table of 16 offsets. It is defined in the same file as the
// host copy (copymemory) and the kernel that reads it, so both refer to the
// same symbol without needing separate compilation and device linking.
__constant__ int test_cons[16];

/*
 * Copies 16 ints from the host array `test` into the test_cons constant array.
 *
 * test: host pointer to at least 16 ints.
 *
 * The function returns void, so a failure cannot be reported to the caller;
 * instead of being silently dropped it is written to stderr.
 */
void copymemory (int *test)
{
    if (test == NULL)
    {
        fprintf(stderr, "copymemory: source pointer is NULL, nothing copied\n");
        return;
    }

    // sizeof(test_cons) keeps the copy size tied to the symbol's declaration.
    cudaError_t status = cudaMemcpyToSymbol(test_cons, test, sizeof(test_cons));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "copymemory: cudaMemcpyToSymbol failed: %s\n",
                cudaGetErrorString(status));
    }
}
/*
 * Adds an offset from the test_cons constant array to every matrix element
 * and, for debugging, prints test_cons[0] from each thread whose x index
 * within its block is 0.
 *
 * Each thread handles one element of a row-major matrix whose rows are
 * 48 ints wide. There is no bounds guard: the launch must not cover more
 * elements than dev_mat and dev_res hold.
 *
 * dev_mat: input matrix (device pointer).
 * dev_res: output matrix (device pointer).
 */
__global__ void test_kernel_1(int *dev_mat,int* dev_res)
{
    const unsigned int tx = threadIdx.x;
    const int col = tx + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int idx = col + 48 * row;   // flat row-major element index

    // Debug output: only threads with tx == 0 print, so the value shown is test_cons[0].
    if (tx == 0)
    {
        printf ("testcons[0] is %d\n", test_cons[tx]) ;
    }

    dev_res[idx] = test_cons[tx] + dev_mat[idx];
}
simple.cu file
#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
// Host helper defined in Kernel.cu: copies 16 ints from the given host array
// into the test_cons constant array.
void copymemory (int *temp) ;
// Kernel defined in Kernel.cu: (input matrix, output matrix), both device pointers.
__global__ void test_kernel_1(int *,int *);
// Aborts with a file/line diagnostic when a CUDA runtime call does not
// return cudaSuccess, so failures are not silently carried forward.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Demo driver: hands 16 offsets to copymemory() (which uploads them to
 * constant memory), fills a 48x48 host matrix with random values, and runs
 * test_kernel_1 on it with a 3x3 grid of 16x16 thread blocks. The host-side
 * matrix printing is commented out; the kernel does its own debug printing.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call made here fails.
 */
int main(int argc,char *argv[])
{
    const int dim = 48;    // matrix is dim x dim; the kernel hard-codes a row width of 48
    const int tile = 16;   // thread-block edge; also the number of offsets passed to copymemory
    const size_t bytes = (size_t)dim * dim * sizeof(int);
    int *mat = NULL, *res = NULL, *dev_mat = NULL, *dev_res = NULL;
    int i, j;
    int test[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};

    (void)argc;
    (void)argv;

    mat = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if (mat == NULL || res == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(mat);
        free(res);
        return EXIT_FAILURE;
    }
    memset(res, 0, bytes);

    // The constant-memory upload happens inside the file that owns the symbol.
    copymemory(test);

    srand((unsigned int)time(NULL));
    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            // rand() % 49 + 1 yields values in 1..49.
            mat[i * dim + j] = rand() % (50 - 1) + 1;
            //printf("%d\t",mat[i*48+j] );
        }
        //printf("\n");
    }

    CUDA_CHECK(cudaMalloc((void **)&dev_mat, bytes));
    CUDA_CHECK(cudaMemcpy(dev_mat, mat, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void **)&dev_res, bytes));

    // Named grid/block rather than gridDim/blockDim to avoid shadowing the
    // CUDA built-in variable names.
    dim3 grid(dim / tile, dim / tile, 1);
    dim3 block(tile, tile, 1);
    test_kernel_1<<<grid, block>>>(dev_mat, dev_res);
    CUDA_CHECK(cudaGetLastError());   // reports launch-configuration errors

    // The blocking copy also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(res, dev_res, bytes, cudaMemcpyDeviceToHost));

    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            // printf("%d\t",res[i*48+j] );
        }
        //printf("\n");
    }

    CUDA_CHECK(cudaFree(dev_mat));
    CUDA_CHECK(cudaFree(dev_res));
    free(mat);
    free(res);
    return 0;
}
I have commented out your printf calls. The printf in the kernel prints the value 1. I also tested changing the value of test[0] in the main function, and it works perfectly.
Upvotes: 1
Reputation: 151819
I just recently answered a similar question here
CUDA can handle code that references device code (entry points) or symbols in other files, but it requires separate compilation with device linking (as described and linked in the link I gave above). (And separate compilation/linking requires CC 2.0 or greater)
So if you modify the link steps, you can have your __constant__ variable in a given file and reference it from a different file.
If not (if you don't specify separate compilation and device linking), then the device code that references the __constant__ variable, the host code that references the __constant__ variable, and the definition/declaration of the variable itself all need to be in the same file.
So this:
__constant__ int test_cons[16];
This:
cudaMemcpyToSymbol(test_cons,test,16*sizeof(int));
And this:
dev_res[row*48+col] = dev_mat[row*48+col] + test_cons[threadIdx.x];
all need to be in the same file.
Upvotes: 2