Reputation: 117
Consider three files: main.cpp, func_prototypes.h and test_kernels.cu. I'd like to build a project from these three files. I tried to write a makefile based on the CUDA samples, but it fails: running make returns an undefined-reference error. My three files and the makefile follow.
main.cpp
#include <iostream>
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <new>
#include <cuda.h>
using namespace std;
#include "func_prototypes.h"
typedef float mytype;

// Fills a host array with 0..N-1, hands it to getSquares, prints the
// resulting values and finally calls ResetCUDA.
// Returns EXIT_SUCCESS.
int main(){
    const unsigned int N = 1024;
    mytype *vec = new mytype[N];
    // The index is unsigned so that it matches N's type and the comparison
    // does not mix signed and unsigned operands.
    for(unsigned int i = 0; i < N; i++){
        vec[i] = i;
    }
    cout << "Calling CUDA function.\n";
    getSquares(vec,N);
    cout << "result:\n";
    for(unsigned int i = 0; i < N; i++){
        cout << vec[i] << " ";
    }
    // Release the host buffer; it was previously leaked.
    delete[] vec;
    ResetCUDA();
    return EXIT_SUCCESS;
}
func_prototypes.h
// Host-side entry point that squares the elements of v in place.
//   v: host array holding N elements of `type`
//   N: number of elements in v
// Only the declaration is visible here; the template body is defined in
// test_kernels.cu, so every `type` used by callers must be instantiated there.
template <class type>
void getSquares(type *v, unsigned const int N);
// Resets the CUDA device (wraps cudaDeviceReset in test_kernels.cu).
void ResetCUDA();
test_kernels.cu
#include <cuda.h>
#include <cstdio>
#include <new>
#define BlockSize 256
// Kernel: each thread squares at most one element of v in place.
//   v: device array of N elements
//   N: number of valid elements; threads whose index is not below N do nothing
// Uses a 1D launch: the element index is derived from the x components only.
template <class type>
__global__
void getSquareKernel(type *v, unsigned const int N){
    const int elem = threadIdx.x + blockDim.x*blockIdx.x;
    // Surplus threads in the last block have no element to work on.
    if(!(elem < N)){
        return;
    }
    v[elem] *= v[elem];
}
// Squares the N elements of the host array v in place using the GPU.
//   v: host array of N elements; overwritten with the squared values
//   N: number of elements; N == 0 is a no-op
// Copies v to a temporary device buffer, runs getSquareKernel over it and
// copies the result back. On any CUDA failure the error is reported on stderr,
// the device buffer is released and v may be left unmodified.
template <class type>
void getSquares(type *v, unsigned const int N){
    // Nothing to do, and a launch with zero blocks is an invalid configuration.
    if(N == 0){
        return;
    }
    const unsigned int threads = BlockSize;
    // Round up so a partial last block covers the tail. The previous
    // ceil(N/threads) truncated in the integer division before ceil ran.
    // Written as quotient + remainder test to avoid overflow in N + threads - 1.
    const unsigned int blocks = N/threads + (N%threads != 0 ? 1 : 0);
    const size_t bytes = static_cast<size_t>(N)*sizeof(type);

    type *d_v = 0;
    cudaError_t err = cudaMalloc(&d_v,bytes);
    if(err == cudaSuccess){
        err = cudaMemcpy(d_v,v,bytes,cudaMemcpyHostToDevice);
    }
    if(err == cudaSuccess){
        getSquareKernel<<<blocks,threads>>>(d_v,N);
        // A kernel launch returns no status; query launch errors explicitly.
        err = cudaGetLastError();
    }
    if(err == cudaSuccess){
        // Blocking copy: also waits for the kernel and reports its failures.
        err = cudaMemcpy(v,d_v,bytes,cudaMemcpyDeviceToHost);
    }
    // Freeing a null pointer is a no-op, so this is safe on every path.
    cudaFree(d_v);
    if(err != cudaSuccess){
        fprintf(stderr,"getSquares: CUDA error: %s\n",cudaGetErrorString(err));
    }
}
// Resets the current CUDA device by calling cudaDeviceReset.
// The status returned by the runtime is intentionally discarded.
void ResetCUDA()
{
    (void)cudaDeviceReset();
}
makefile
############################# Makefile ##########################
# Builds the `test` executable from main.cpp (host compiler) and
# test_kernels.cu (nvcc), linked against the CUDA runtime.
#   make            build ./test
#   make dbg=1      build with host and device debug information
#   make run        build and execute ./test
#   make clean      remove the executable and object files
# Recipe lines below must start with a TAB character.

CUDA_PATH     ?= /usr/local/cuda-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin

# Target word size (32 or 64). Detected from the host unless set by the
# caller; it was previously never defined, so the 32-bit branch was dead.
OS_SIZE ?= $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")

ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
CPPFLAGS      := -m32
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
CPPFLAGS      := -m64
endif

LDFLAGS   := -L$(CUDA_LIB_PATH) -lcudart
# nvcc has to target the same word size as the host compiler.
NVCCFLAGS := $(CPPFLAGS)

# Debug build flags
ifeq ($(dbg),1)
CPPFLAGS  += -g
NVCCFLAGS += -g -G
endif

INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc

CPP  = icpc
NVCC = $(CUDA_BIN_PATH)/nvcc

SOURCE = main.cpp
AUX    = test_kernels.cu

# These targets do not name files that the rules create.
.PHONY: all run clean

all: test

test_kernels.o: $(AUX)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o test_kernels.o -c $(AUX)

# main.cpp includes func_prototypes.h, so rebuild when the header changes.
main.o: $(SOURCE) func_prototypes.h
	$(CPP) $(CPPFLAGS) $(INCLUDES) -o main.o -c $(SOURCE)

# The -m flag is repeated at link time so a 32-bit build links as 32-bit.
test: test_kernels.o main.o
	$(CPP) $(CPPFLAGS) -o test test_kernels.o main.o $(LDFLAGS)

run: test
	./test

clean:
	rm -f test *.o
The returned error is main.o:main.cpp:function main: error: undefined reference to 'void getSquares<float>(float*, unsigned int)'
make: *** [test] Error 1
Does anyone know where my mistake is?
EDIT: For the record, my OS is Ubuntu 12.04 x86_64, kernel 3.2.0-39
Upvotes: 3
Views: 1498
Reputation: 72349
This is a manifestation of the standard gotcha in template compilation.
Your host function getSquares and kernel getSquareKernel were never instantiated in the compilation unit where they were defined (i.e. inside test_kernels.cu). Therefore the compiler never emits any code for getSquares, and the linkage fails.
Because you are working with a combined host code/device code compilation trajectory in test_kernels.cu, the correct solution is to explicitly instantiate all the variations of the template code you will need inside test_kernels.cu, by adding something like this:
// Explicit instantiations for float: they make the compiler emit the kernel
// and the host wrapper in this compilation unit so that main.o can link
// against getSquares<float>. Add one pair per element type that callers use.
template __global__ void getSquareKernel<float>(float *, unsigned int);
template void getSquares<float>(float *, unsigned int);
to the bottom of test_kernels.cu. This will ensure that both the device and host code instances you need to link this are present at link time.
The other alternative would be to change the file containing main() to a .cu file, include test_kernels.cu in that file, and compile the whole thing with nvcc. In that case, the instantiation of the host function template inside main() should trigger compilation of the complete template chain within the same compilation unit.
Disclaimer: I don't have a machine in front of me on which I can test any of this, so caveat emptor for at least the code...
Upvotes: 4