Alex Iacob
Alex Iacob

Reputation: 43

Can I call a __device__ function from a CUDA kernel function?

I am trying to call two device functions from a CUDA Kernel function:

edit: to avoid confusion that the function definitions are on different file as the kernel definition, i provided the full code:

Complete code:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<iostream>
#include<fstream>
#include<string>
#include<iterator>
using namespace std;


#define POLYNOMIAL 0x04C11DB7L //Standard CRC-32 polynomial
#define M 62352 //Number of bits in the bloom filter
#define K 4  //Number of bits set per mapping in filter

typedef unsigned short int word16;
typedef unsigned int word32;

__device__ static word32 CrcTable[256]; //Table of 8-bit CRC32 remainders
__device__ char BFilter[M / 8];         //Bloom filter array of M/8 bytes
word32 NumBytes;                //Number of bytes in Bloom filter

void gen_crc_table(void);
__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);
__device__ void mapBloom(word32 hash);          
__device__ word32 crc32;
__device__ int retCode; 

__global__ void mapBloomKernel(const char* d_wordList, int* sizeOfWords)
{
    //access thread id
    const unsigned int bid = blockIdx.x;
    const unsigned int tid = threadIdx.x;
    const unsigned int index = bid * blockDim.x + tid;

    const char *current_word = &(*(d_wordList+(index*30)));
    for(int i=0; i<K; i++)
    {
        crc32 = update_crc(i, d_wordList+(index*30), sizeOfWords[index]);
        mapBloom(crc32);
    }

}

/*
    Main Function
*/

int main()
{
    FILE *fp1;                  
    FILE *fp2;                  
    word32 i;

    cout<<"-----------------------------------------------"<<endl;
    cout<<"--  Program to implement a general Bloom filter  --\n";
    cout<<"-----------------------------------------------"<<endl;

    //Determine number of bytes in Bloom Filter
    NumBytes = M/8;
    if((M%8)!=0)
    {
        cout<<"*** ERROR - M value must be dibisible by 8 \n";
        exit(1);
    }

    //Initialize the CRC32 table
    gen_crc_table();

    //Clear the Bloom filter
    for(i = 0; i<NumBytes; i++)
    {
        BFilter[i] = 0x00;
    }

    fp1 = fopen("word_list_10000.txt","r");
    if(fp1 == NULL)
    {
        cout<<"ERROR in opening input file #1 ***\n";
        exit(1);
    }

    fp2 = fopen("bloom_query.txt","r");
    if(fp2 == NULL)
    {
        cout<<"ERROR in opening input file #2 ***\n";
        exit(1);
    }

     //determine the number of words in list: 

     std::ifstream f("word_list_10000.txt");
     std::istream_iterator<std::string> beg(f), end;
     int number_of_words = distance(beg,end);

     cout<<"Number of words in file: "<<number_of_words<<endl;
     cout<<"size of char: "<<sizeof(char)<<endl;

     cout<<"Reading to array!:  "<<endl;
     ifstream file("word_list_10000.txt");

     const int text_length = 30;

     char *wordList = new char[10000 * text_length];
     int *sizeOfWords = new int[10000];

     for(int i=0; i<number_of_words; i++)
     {
         file>>wordList + (i*text_length);
         sizeOfWords[i] = strlen(wordList + (i*text_length));
         cout<<wordList + (i*text_length)<<endl;
     }

      char *dev_wordList;
      char *dev_sizeOfWords;

      cudaMalloc((void**)&dev_wordList, 30*number_of_words*sizeof(char));
      cudaMalloc((void**)&dev_sizeOfWords, number_of_words * sizeof(char));
      cudaMemcpy(dev_wordList, wordList, 30 * number_of_words * sizeof(char), cudaMemcpyHostToDevice);
      cudaMemcpy(dev_sizeOfWords, sizeOfWords, number_of_words * sizeof(char), cudaMemcpyHostToDevice);


    unsigned int crc_size = sizeof(word32) * 256;
    unsigned int bfilter_size = sizeof(char) * M/8;

    static word32* d_CrcTable;
    char* d_BFilter;

    cudaMalloc((void**)&d_CrcTable, crc_size);
    cudaMalloc((void**)&d_BFilter, bfilter_size);

    //copy host arrays CrcTable & BFilter to device memory

    cudaMemcpy(d_CrcTable, CrcTable, crc_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_BFilter, BFilter, bfilter_size, cudaMemcpyHostToDevice);

    //Setup execution parameters
    int n_blocks = (number_of_words + 255)/256;
    int threads_per_block = 256;

    dim3 grid(n_blocks, 1, 1);
    dim3 threads(threads_per_block, 1, 1);

    mapBloomKernel<<<grid, threads>>>(dev_wordList, sizeOfWords);

    fclose(fp1);

    //Output results header
    cout<<"----------------------------------------------------------\n";
    cout<<"Matching strings are...  \n";

    /*


    ...
    ...
    ...

    */

    fclose(fp2);
}



/*
 * Function to initialize CRC32 table
 */

void gen_crc_table(void)
{
    register word32 crc_accum;
    register word16 i, j;
    //Initialize the CRC32 8-bit look-up table
    for(i=0; i<256; i++)
    {
        crc_accum = ((word32) i<<24);
        for(j=0; j<8; j++)
        {
            if(crc_accum & 0x80000000L)
                crc_accum = (crc_accum << 1) ^POLYNOMIAL;
            else
                crc_accum = (crc_accum << 1);
        }
        CrcTable[i] = crc_accum;
        //cout<<CrcTable[i]<<endl;
    }
}

/*
 * Function to generate CRC32
 */

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size)
{
    register word32 i, j;
    for(j=0; j<data_blk_size; j++)
    {
        i = ((int) (crc_accum >>24) ^ *data_blk_ptr++) & 0xFF;
        crc_accum = (crc_accum << 8) ^ CrcTable[i];
    }
    crc_accum = ~crc_accum;

    return crc_accum;
}

/*
 * Function to map hash into Bloom filter
 */

__device__ void mapBloom(word32 hash)
{
    int tempInt;
    int bitNum;
    int byteNum;
    unsigned char mapBit;
    tempInt = hash % M;
    byteNum = tempInt / 8;
    bitNum = tempInt % 8;

    mapBit = 0x80;
    mapBit = mapBit >> bitNum;

    //Map the bit into Bloom filter 
    BFilter[byteNum] = BFilter[byteNum] | mapBit;
}

/*
 * Function to test for a Bloom filter match
 */

__device__ int testBloom(word32 hash)
{
    int tempInt;
    int bitNum;
    int byteNum;
    unsigned char testBit;
    int retCode;
    tempInt = hash % M;
    byteNum = tempInt / 8;
    bitNum = tempInt % 8;

    testBit = 0x80;
    testBit = testBit >> bitNum;
    if (BFilter[byteNum] & testBit)
        retCode = 1;
    else
        retCode = 0;
    return retCode;
}

Command line used to compile:

/OUT:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.exe" /INCREMENTAL:NO
 /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" "cudart.lib" 
"kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib"
 "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST 
/ManifestFile:"Debug\CUDA_Bloom_filter_v0.exe.intermediate.manifest" /ALLOWISOLATION 
/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG 
/PDB:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pdb" 
/SUBSYSTEM:CONSOLE 
/PGD:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pgd" /TLBID:1 
/DYNAMICBASE /NXCOMPAT /MACHINE:X86 /ERRORREPORT:QUEUE 

Full output:

7   IntelliSense: expected an expression e:\...\kernel.cu   145 18  CUDA_Bloom_filter_v0
    Error   6   error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --
cl-version 2010 -ccbin "F:\Installed\Microsoft Visual Studio 2010\VC\bin"  -I"C:\Program Files\NVIDIA
 GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v5.5\include"  -G   --keep-dir Debug -maxrregcount=0  --machine 32 --compile -cudart 
static  -g   -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  "
 -o Debug\kernel.cu.obj 
"E:\...\kernel.cu"" exited with code 2. C:\...\CUDA 5.5.targets 592 10  CUDA_Bloom_filter_v0
    Error   5   error : **External calls are not supported** (found non-inlined call to _Z10update_crcjPKcj)    E:\...\kernel.cu    40  1   CUDA_Bloom_filter_v0

Upvotes: 1

Views: 4476

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 152174

The compiler is getting confused because your function prototype (forward declaration) looks like this:

__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);

but your definition is like this:

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size)
{

Your function definition expects the second parameter to be of type char *. But you are passing a const char * parameter (and your forward declaration is of type const char *).

This is a basic C/C++ coding error.

Your forward declaration should match your definition. Since it did not, the compiler looked elsewhere for a matching function and could not find it.

The fix for this issue is to make your function definition match:

                                               add const here
                                               v
__device__ word32 update_crc(word32 crc_accum, const char *data_blk_ptr, word32 data_blk_size)
{

Note that when I compile your code with this fix, there are still some pretty important warnings:

t573.cu(73): warning: a __device__ variable "BFilter" cannot be directly written in a host function

t573.cu(185): warning: a __device__ variable "CrcTable" cannot be directly written in a host function

These should not be ignored. For example, taking the first warning, you have this variable:

__device__ char BFilter[M / 8];         //Bloom filter array of M/8 bytes

You cannot write this variable directly in your host code (in main):

//Clear the Bloom filter
for(i = 0; i<NumBytes; i++)
{
    BFilter[i] = 0x00;
}

Instead use a function like cudaMemcpyToSymbol()

Upvotes: 2

Related Questions