Reputation: 5940
I have a simple program made up of two CUDA files, main.cu and kernel.cu. Its goal is to compute the element-wise sum of two vectors.
// main.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
/* kernel.cu is textually included here, so it must not also be passed to nvcc
   as a separate source file (that would define its functions twice). */
#include "kernel.cu"
/*
 * Fills two host vectors A and B with random values, adds B into A on the GPU
 * with the AddInts kernel, and prints the first five elements of the vectors
 * before and after the addition.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE, after printing a message to
 * stderr, as soon as any CUDA call reports an error.
 */
int main(){
    /* Error code to check return values for CUDA calls */
    cudaError_t err = cudaSuccess;
    srand((unsigned int)time(NULL));

    /* Constants, so that A and B are ordinary fixed-size arrays. */
    const int count = 100;
    const int shown = 5;   /* number of elements printed; matches the labels */
    int A[count], B[count];
    int i;
    for(i=0;i<count;i++){
        A[i] = rand() % count;
        B[i] = rand() % count;
    }

    /* Display the beginning of vectors A and B. */
    printf("\nPrimi cinque valori di A = ");
    for(i=0;i<shown;i++){printf("%d ", A[i]);}
    printf("\nPrimi cinque valori di B = ");
    for(i=0;i<shown;i++){printf("%d ", B[i]);}

    /* Launch configuration: ceil-div so that no spare block is launched when
       count is a multiple of the block size. */
    const int numThreads = 256;
    const int numBlocks = (count + numThreads - 1) / numThreads;

    /* AddInts is launched with numBlocks*numThreads threads and every one of
       them reads b[ID] and writes a[ID]. The device buffers are therefore
       sized for the whole grid, not just for count elements, so that the
       threads past the end of the data stay inside the allocations. */
    const size_t dataBytes = (size_t)count * sizeof(int);
    const size_t gridBytes = (size_t)numBlocks * (size_t)numThreads * sizeof(int);

    int *d_A, *d_B;
    err = cudaMalloc((void**)&d_A, gridBytes);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to allocate device vector A (error code %s)! \n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMalloc((void**)&d_B, gridBytes);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to allocate device vector B (error code %s)! \n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    /* Zero the buffers so the padding elements hold defined values. */
    err = cudaMemset(d_A, 0, gridBytes);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to clear device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMemset(d_B, 0, gridBytes);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to clear device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_A, A, dataBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMemcpy(d_B, B, dataBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    AddInts<<<numBlocks,numThreads>>>(d_A,d_B);
    /* A kernel launch returns nothing; launch errors must be fetched here. */
    err = cudaGetLastError();
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to launch AddInts kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    /* The sum was accumulated in place in d_A; copy only the real elements back. */
    err = cudaMemcpy(A, d_A, dataBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_A);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_B);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("\nPrimi cinque valori di A = ");
    for(i=0;i<shown;i++){printf("%d ", A[i]);}
    printf("\n");
    return 0;
}
Here is the kernel.cu file:
// kernel.cu
/*
 * Returns the calling thread's flat index along x across the whole grid:
 * the offset of its block (blockIdx.x * blockDim.x) plus its position inside
 * that block (threadIdx.x). Only the x components are used.
 */
__device__ int get_global_index(){
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const unsigned int flat = blockStart + threadIdx.x;
    return flat;
}
/*
 * Kernel: in-place element-wise addition, a[i] = a[i] + b[i], where i is the
 * value returned by get_global_index() for the calling thread.
 *
 * a: vector that receives the sum (read and written).
 * b: vector that is added to a (read only).
 *
 * There is no bounds check: every launched thread accesses a[i] and b[i], so
 * both arrays must hold at least as many elements as there are threads in
 * the grid.
 */
__global__ void AddInts(int *a, int *b){
    const int idx = get_global_index();
    a[idx] = a[idx] + b[idx];
}
I am 100% sure that the main.cu file is correct. I also know that I could just put the kernel directly in main.cu, but that is not the intention of my test. Likewise, I know that I could get rid of the `__device__` function and put its body directly inside the `__global__` function, but that is not my intention either.
However, when I compile the test by running `nvcc main.cu kernel.cu` in the terminal, I get the following error message:
/tmp/tmpxft_0000248b_00000000-30_kernel.o: In function `get_global_index()':
tmpxft_0000248b_00000000-8_kernel.cudafe1.cpp:(.text+0x15): multiple definition of ` get_global_index()'
/tmp/tmpxft_0000248b_00000000-21_main.o:tmpxft_0000248b_00000000-3_main.cudafe1.cpp:(.text+0x15): first defined here
/tmp/tmpxft_0000248b_00000000-30_kernel.o: In function `__device_stub__Z7AddIntsPiS_(int*, int*)':
tmpxft_0000248b_00000000-8_kernel.cudafe1.cpp:(.text+0x7c): multiple definition of `__device_stub__Z7AddIntsPiS_(int*, int*)'
/tmp/tmpxft_0000248b_00000000-21_main.o:tmpxft_0000248b_00000000-3_main.cudafe1.cpp:(.text+0x68e): first defined here
/tmp/tmpxft_0000248b_00000000-30_kernel.o: In function `AddInts(int*, int*)':
tmpxft_0000248b_00000000-8_kernel.cudafe1.cpp:(.text+0xe5): multiple definition of `AddInts(int*, int*)'
/tmp/tmpxft_0000248b_00000000-21_main.o:tmpxft_0000248b_00000000-3_main.cudafe1.cpp:(.text+0x6f7): first defined here
collect2: error: ld returned 1 exit status
I believe the error is caused by the definition of the device function `get_global_index()`, but I don't understand what is wrong with it. Does anyone have any idea what is wrong?
Upvotes: 0
Views: 2522
Reputation: 151889
Two options:
1. Just compile main.cu (`nvcc main.cu`). It will already pick up kernel.cu, since you're including it.
2. Don't include kernel.cu in main.cu.
When you include kernel.cu in main.cu (and pass both files to the compiler), the compiler compiles that code (kernel.cu) twice: once when it is compiling main.cu, and once when it is compiling kernel.cu. The linker then sees two definitions of every function in kernel.cu, which is exactly the "multiple definition" error you are getting. If you choose the second option, you'll need to provide a prototype (forward declaration) for the `AddInts` kernel in main.cu, perhaps by including a header file that contains just that prototype. And in the more general case, if you spread things out into more files, you may need to add `-rdc=true` to your compile command line, for example if you have files with `__global__` functions that reference `__device__` functions defined in other files.
Upvotes: 4