Bessa
Bessa

Reputation: 117

makefile: cpp + cu files - error of undefined reference

Consider three files: main.cpp, func_prototypes.h and test_kernels.cu. I'd like to build a project from these three files. I tried to write a makefile based on the CUDA samples, but it fails: running make returns an undefined reference error. My three files and the makefile follow.

main.cpp

#include <iostream>

#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>

#include <new>

#include <cuda.h>

using namespace std;

#include "func_prototypes.h"

typedef float mytype;

int main(){

    mytype *vec;
    unsigned int N = 1024;

    vec = new mytype[N];

    for(int i = 0; i < N; i++){
        vec[i] = i;
    }

    cout << "Calling CUDA function.\n";

    getSquares(vec,N);

    cout << "result:\n";

    for(int i = 0; i < N; i++){
        cout << vec[i] << " ";
    }

    ResetCUDA();
    return EXIT_SUCCESS;

}

func_prototypes.h

#ifndef FUNC_PROTOTYPES_H
#define FUNC_PROTOTYPES_H

/**
 * Squares the N elements of the array v in place.
 *
 * NOTE: only the declaration is visible here; the definition lives in
 * test_kernels.cu, so every instantiation used from another translation
 * unit has to be emitted (explicitly instantiated) there.
 *
 * @param v host array of N elements, overwritten with the squared values
 * @param N number of elements in v
 */
template <class type>
void getSquares(type *v, unsigned const int N);

/** Calls cudaDeviceReset(); defined in test_kernels.cu. */
void ResetCUDA();

#endif // FUNC_PROTOTYPES_H

test_kernels.cu

#include <cuda.h>
#include <new>

#define BlockSize 256

/**
 * CUDA kernel: each thread squares, in place, the element of v selected by
 * its global thread index. Threads whose index is not below N do nothing.
 *
 * @param v array whose elements are squared
 * @param N number of elements of v to process
 */
template <class type>
__global__
void getSquareKernel(type *v, unsigned const int N){
    int gid = threadIdx.x + blockDim.x*blockIdx.x;

    // Guard clause: surplus threads of the last block leave immediately.
    if(!(gid < N)){
        return;
    }

    type &elem = v[gid];
    elem *= elem;
}



/**
 * Squares the N elements of the host array v in place: copies v to a
 * temporary device buffer, runs getSquareKernel on it and copies the result
 * back into v.
 *
 * CUDA status codes are not reported to the caller (the function returns
 * void); if the device allocation or the upload fails, v is left untouched.
 *
 * @param v host array of N elements, overwritten with the squared values
 * @param N number of elements in v
 */
template <class type>
void getSquares(type *v, unsigned const int N){

    // Nothing to square; this also avoids launching a grid of zero blocks.
    if(N == 0){
        return;
    }

    const unsigned int threads = BlockSize;
    // Integer ceiling division. The previous ceil(N/threads) truncated
    // before rounding, so a partial last block was never launched when N
    // was not a multiple of the block size.
    const unsigned int blocks = N/threads + (N%threads != 0 ? 1u : 0u);

    const size_t bytes = N*sizeof(type);

    type *d_v = 0;
    if(cudaMalloc(&d_v,bytes) != cudaSuccess){
        return;   // no device buffer: do not copy to or launch on an invalid pointer
    }

    if(cudaMemcpy(d_v,v,bytes,cudaMemcpyHostToDevice) == cudaSuccess){
        getSquareKernel<<<blocks,threads>>>(d_v,N);

        cudaMemcpy(v,d_v,bytes,cudaMemcpyDeviceToHost);
    }

    // Always release the device buffer once it has been allocated.
    cudaFree(d_v);

}

/** Calls cudaDeviceReset() and discards the status code it returns. */
void ResetCUDA(){
    static_cast<void>(cudaDeviceReset());
}

makefile

############################# Makefile ##########################
# Builds the `test` executable from main.cpp (host compiler) and
# test_kernels.cu (nvcc).
#   make             - build ./test
#   make dbg=1       - build with debug information
#   make OS_SIZE=32  - 32-bit build (64-bit is the default)
#   make run         - build and execute ./test
#   make clean       - remove the executable and object files
CUDA_PATH       ?= /usr/local/cuda-5.0
CUDA_INC_PATH   ?= $(CUDA_PATH)/include
CUDA_BIN_PATH   ?= $(CUDA_PATH)/bin

# Select the library directory and the machine flag in one place so that the
# host compiler, nvcc and the link step all target the same word size.
ifeq ($(OS_SIZE),32)
    CUDA_LIB_PATH  ?= $(CUDA_PATH)/lib
    ARCH_FLAG      := -m32
else
    CUDA_LIB_PATH  ?= $(CUDA_PATH)/lib64
    ARCH_FLAG      := -m64
endif

LDFLAGS     :=  -L$(CUDA_LIB_PATH) -lcudart
CPPFLAGS    :=  $(ARCH_FLAG)
NVCCFLAGS   +=  $(ARCH_FLAG)

# Debug build flags
ifeq ($(dbg),1)
      CPPFLAGS      += -g
      NVCCFLAGS     += -g -G
endif

INCLUDES    := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc

CPP     =       icpc
NVCC        =       $(CUDA_BIN_PATH)/nvcc

SOURCE  =       main.cpp
AUX     =       test_kernels.cu
HEADERS =       func_prototypes.h

# These targets do not name files; always run their recipes.
.PHONY: all run clean

all:    test

test_kernels.o: $(AUX)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ -c $<

# main.cpp includes the header, so rebuild main.o when it changes.
main.o: $(SOURCE) $(HEADERS)
	$(CPP) $(CPPFLAGS) $(INCLUDES) -o $@ -c $<

test:   test_kernels.o  main.o
	$(CPP) $(CPPFLAGS) -o $@ $^ $(LDFLAGS)

run: test
	./test

clean:
	rm -f test *.o

The returned error is main.o:main.cpp:function main: error: undefined reference to 'void getSquares<float>(float*, unsigned int)' make: *** [test] Error 1

Does anyone know where my mistake is?

EDIT: For the record, my OS is Ubuntu 12.04 x86_64, kernel 3.2.0-39

Upvotes: 3

Views: 1498

Answers (1)

talonmies
talonmies

Reputation: 72349

This is a manifestation of the standard gotcha in template compilation.

Your host function getSquares and kernel getSquareKernel were never instantiated in the compilation unit where they were defined (i.e. inside test_kernels.cu). Therefore the compiler never emits any code for getSquares, and the linkage fails.

Because you are working with a combined host code/device code compilation trajectory in test_kernels.cu, the correct solution is to explicitly instantiate all the variations of the template code you will need inside test_kernels.cu, by adding something like this:

// Explicit instantiations: make the compiler emit the float versions of the
// kernel and of the host function in this translation unit so they exist at link time.
template __global__ void getSquareKernel<float>(float *, unsigned int);
template void getSquares<float>(float *, unsigned int);

to the bottom of test_kernels.cu. This will ensure that both the device and host code instances you need to link this are present at link time.

The alternative would be to change the file containing main to a .cu file, include test_kernels.cu in that file, and compile the whole thing with nvcc. In that case, the instantiation of the host function template inside main() should trigger compilation of the complete template chain within the same compilation unit.

Disclaimer: I don't have a machine in front of me on which I can test any of this, so caveat emptor for at least the code...

Upvotes: 4

Related Questions