Reputation: 41
I am having a spot of bother with this basic CUDA code.
I have a char**
which is a flat 2d array of passwords, my current implementation is for CUDA simply to iterate through this list and display the passwords. However, when I go to display them I simply get "(NULL)". I'm not quite sure why this is. Can someone explain what is happening?
Main:
char ** pwdAry;
pwdAry = new char *[numberOfPwd];
//pwdAry given some values (flat 2d array layout)
// BUG: sizeof(pwdAry) is the size of the pointer variable itself (4 or 8
// bytes), not the number of passwords -- it cannot be used as an element count.
const int pwdArySize = sizeof(pwdAry);
dim3 grid(gridSize,gridSize);
dim3 block(blockSize,blockSize);
// BUG: pwdAry is a host allocation (new[]), and the char* entries it holds are
// host pointers too. Passing it straight to the kernel makes the GPU
// dereference host addresses, which is why the device printf shows "(null)".
// The data must first be deep-copied to the device, flattened into one device
// buffer, or placed in managed memory (see the answer below).
searchKeywordKernel << <grid, block >> >(pwdAry);
// NOTE(review): returning without cudaDeviceSynchronize() may also end the
// process before the kernel's printf output is flushed.
return EXIT_SUCCESS;
Cuda:
// OP's kernel: computes a global 2D thread index, then round-trips it back
// into block coordinates, and prints one password per thread.
__global__ void searchKeywordKernel(char **passwordList)
{
// global x/y coordinates of this thread across the whole grid
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
// pitch = total threads per grid row; idx = flat global index
int pitch = blockDim.x * gridDim.x;
int idx = x + y * pitch;
// tidy/tidx just recover y and x from idx (redundant round-trip)
int tidy = idx / pitch;
int tidx = idx - (pitch * tidy);
int bidx = tidx / blockDim.x;
int bidy = tidy / blockDim.y;
// NOTE(review): currentThread is only the index WITHIN a block, so with more
// than one block, threads in different blocks all read the same entries.
int currentThread = threadIdx.x + blockDim.x * threadIdx.y;
printf("hi, i am thread: %i, and my block x: %i, and y: %i\n", currentThread, bidx, bidy);
// passwordList as launched in the question holds HOST pointers, so this
// dereference is invalid on the device -- hence the "(null)" output.
printf("My password is: %s\n", passwordList[currentThread]);
}
Upvotes: 0
Views: 3411
Reputation: 151859
Based on discussion in the comments, here is an example code that roughly follows the code in the question, using 3 different methods:
Use a "flattened" array. This is the traditional advice for beginners who are asking how to handle a double-pointer array (char **, or any other type), or any data structure that contains embedded pointers. The basic idea is to create a single-pointer array of the same type (e.g. char *), and copy all the data to that array, end-to-end. In this case, since the array elements are of variable length, we also need to pass an array containing the starting index of each string.
Use a direct double-pointer method. I consider this code difficult to write. It may also have performance implications. The canonical example is here, and a stepwise description of what is required algorithmically is here and/or here is a 3D (i.e. triple-pointer) worked example with method description (yuck!). This is fundamentally doing a deep-copy in CUDA, and I consider it somewhat more difficult than typical CUDA coding.
Use the managed memory subsystem, that is available in CUDA platforms that support it. Coding-wise, this is probably simpler than either of the above 2 approaches.
Here is a worked example of all 3 methods:
$ cat t1035.cu
#include <stdio.h>
#include <string.h>
#define nTPB 256
// Prints one string per thread from a "flattened" layout.
// data:        one device buffer holding all strings end-to-end (NUL-terminated)
// indices:     byte offset of string i within data
// num_strings: number of valid entries; extra grid-tail threads do nothing
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_strings)
        return;                                   // grid tail guard
    char *my_string = data + indices[tid];
    printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Prints one string per thread from a pointer-to-pointer layout.
// data:        device array of device char* pointers (deep-copied or managed)
// num_strings: number of valid entries; extra grid-tail threads do nothing
__global__ void kern_2D(char **data, unsigned num_strings){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_strings)
        return;                                   // grid tail guard
    char *my_string = data[tid];
    printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Demonstrates three ways to make an array of host strings visible to a CUDA
// kernel:
//   1) "flattened" single device buffer + offset table      -> kern_1D
//   2) deep-copied char** (pointer-to-pointer) device array -> kern_2D
//   3) managed (unified) memory, usable from host and device -> kern_2D
// Improvements over the original example: every CUDA API call and kernel
// launch is error-checked, and all host/device allocations are released
// before exit. Output is unchanged (checks only print on failure).
// Error-check helper; valid only inside main() (bails out with return 1).
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(err_)); \
        return 1; } } while (0)
int main(){
  const int num_strings = 3;
  // NOTE: the explicit "\0" makes sizeof() count 4 bytes per string
  // (two chars + explicit NUL + implicit NUL); kept for identical sizing.
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  int ds[num_strings];                 // byte length of each string, incl. NULs
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);

  // method 1: "flattening" -- copy every string end-to-end into one host
  // buffer, remembering each string's starting byte offset.
  char *fdata = (char *)malloc((ds[0]+ds[1]+ds[2])*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    strcpy(fdata+next, data[i]);
    ind[i] = next;
    next += ds[i];}
  // copy flattened data + offset table to the device
  char *d_fdata;
  unsigned *d_ind;
  CUDA_CHECK(cudaMalloc(&d_fdata, next*sizeof(char)));
  CUDA_CHECK(cudaMalloc(&d_ind, num_strings*sizeof(unsigned)));
  CUDA_CHECK(cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice));
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  CUDA_CHECK(cudaGetLastError());        // catch launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());   // catch async execution errors

  // method 2: "2D" (pointer-to-pointer) array -- a device array of device
  // pointers. The device pointer values are staged in a host-side temp array
  // (d_temp_data) because we cannot dereference d_data on the host.
  char **d_data;
  CUDA_CHECK(cudaMalloc(&d_data, num_strings*sizeof(char *)));
  char **d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    CUDA_CHECK(cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char)));
    CUDA_CHECK(cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice));}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // method 3: managed allocations -- both pointer array and strings are
  // directly accessible from host and device; no explicit copies needed.
  char **m_data;
  CUDA_CHECK(cudaMallocManaged(&m_data, num_strings*sizeof(char *)));
  CUDA_CHECK(cudaMallocManaged(&(m_data[0]), ds[0]*sizeof(char)));
  CUDA_CHECK(cudaMallocManaged(&(m_data[1]), ds[1]*sizeof(char)));
  CUDA_CHECK(cudaMallocManaged(&(m_data[2]), ds[2]*sizeof(char)));
  // initialize said array directly from host code
  strcpy(m_data[0], s0);
  strcpy(m_data[1], s1);
  strcpy(m_data[2], s2);
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());   // required before host touches m_data again

  // cleanup (behavior-neutral; the example's printed output is unchanged)
  for (int i = 0; i < num_strings; i++){
    CUDA_CHECK(cudaFree(d_temp_data[i]));
    CUDA_CHECK(cudaFree(m_data[i]));
    free(data[i]);}
  CUDA_CHECK(cudaFree(d_fdata));
  CUDA_CHECK(cudaFree(d_ind));
  CUDA_CHECK(cudaFree(d_data));
  CUDA_CHECK(cudaFree(m_data));
  free(d_temp_data);
  free(data);
  free(fdata);
  free(ind);
  return 0;
}
$ nvcc -arch=sm_35 -o t1035 t1035.cu
$ cuda-memcheck ./t1035
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 2:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 3:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
========= ERROR SUMMARY: 0 errors
$
Notes:
I suggest running this code with cuda-memcheck
if you are just testing it out for the first time. I have omitted proper cuda error checking for brevity of presentation, but I recommend it any time you are having trouble with a CUDA code. Proper execution of this code depends on having a managed memory subsystem available (read the doc links I have provided). If your platform does not support it, running this code as-is will probably result in a seg fault, because I have not included proper error checking.
Copying a double-pointer array from device to host, although not explicitly covered in this example, is essentially the reverse of the steps for each of the 3 methods. For method 1, a single cudaMemcpy
call can do it. For method 2, it requires a for-loop that reverses the steps to copy to the device (including the use of the temp pointers). For method 3, nothing at all is required, other than proper adherence to managed memory coding practices, such as use of cudaDeviceSynchronize()
after a kernel call, before attempting to access the device from host code again.
I don't wish to argue about whether or not methods 1 and 3 explicitly adhere to the letter of the question in terms of providing a method to pass a char ** array to a CUDA kernel. If your focus is that narrow, then please use method 2, or else disregard this answer entirely.
EDIT: Based on a question in the comments below, here is the above code modified with a different initialization sequence for the host-side strings (at line 42). There are now compilation warnings, but those warnings arise from the code specifically requested to be used by OP:
$ cat t1036.cu
#include <stdio.h>
#include <string.h>
#define nTPB 256
// Prints one string per thread from a "flattened" layout.
// data:        one device buffer holding all strings end-to-end (NUL-terminated)
// indices:     byte offset of string i within data
// num_strings: number of valid entries; extra grid-tail threads do nothing
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_strings)
        return;                                   // grid tail guard
    char *my_string = data + indices[tid];
    printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Prints one string per thread from a pointer-to-pointer layout.
// data:        device array of device char* pointers (deep-copied or managed)
// num_strings: number of valid entries; extra grid-tail threads do nothing
__global__ void kern_2D(char **data, unsigned num_strings){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_strings)
        return;                                   // grid tail guard
    char *my_string = data[tid];
    printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Same 3-method demo as t1035.cu, but the host strings are now built the way
// the question's code does it: fixed 1024-byte slots initialized by assigning
// a string literal (as specifically requested in the comments).
int main(){
const int num_strings = 3;
// Previous (t1035.cu) initialization, disabled but kept for reference.
#if 0
const char s0[] = "s1\0";
const char s1[] = "s2\0";
const char s2[] = "s3\0";
int ds[num_strings];
ds[0] = sizeof(s0)/sizeof(char);
ds[1] = sizeof(s1)/sizeof(char);
ds[2] = sizeof(s2)/sizeof(char);
// pretend we have a dynamically allocated char** array
char **data;
data = (char **)malloc(num_strings*sizeof(char *));
data[0] = (char *)malloc(ds[0]*sizeof(char));
data[1] = (char *)malloc(ds[1]*sizeof(char));
data[2] = (char *)malloc(ds[2]*sizeof(char));
// initialize said array
strcpy(data[0], s0);
strcpy(data[1], s1);
strcpy(data[2], s2);
#endif
// OP-requested initialization. NOTE(review): `pwdAry[a] = "hello\0";`
// overwrites (and leaks) the new[1024] buffers and stores pointers to string
// literals, dropping const -- this is the source of the two "conversion from
// a string literal" compiler warnings shown below.
char ** pwdAry; pwdAry = new char *[num_strings]; for (int a = 0; a < num_strings; a++) { pwdAry[a] = new char[1024]; } for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; }
// method 1: "flattening" -- each string occupies a fixed 1024-byte slot, so
// offsets are simple multiples of 1024 (memcpy reads past the literal's end,
// but stays within the flat buffer's bounds).
char *fdata = (char *)malloc((1024*num_strings)*sizeof(char));
unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
unsigned next = 0;
for (int i = 0; i < num_strings; i++){
memcpy(fdata+next, pwdAry[i], 1024);
ind[i] = next;
next += 1024;}
//copy flattened data + offset table to device
char *d_fdata;
unsigned *d_ind;
cudaMalloc(&d_fdata, next*sizeof(char));
cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
printf("method 1:\n");
kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
cudaDeviceSynchronize();
//method 2: "2D" (pointer-to-pointer) array -- device pointer values are
// staged in the host-side d_temp_data array, since d_data cannot be
// dereferenced on the host.
char **d_data;
cudaMalloc(&d_data, num_strings*sizeof(char *));
char **d_temp_data;
d_temp_data = (char **)malloc(num_strings*sizeof(char *));
for (int i = 0; i < num_strings; i++){
cudaMalloc(&(d_temp_data[i]), 1024*sizeof(char));
cudaMemcpy(d_temp_data[i], pwdAry[i], 1024*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
printf("method 2:\n");
kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
cudaDeviceSynchronize();
// method 3: managed allocations -- accessible from both host and device
// start over with a managed char** array
char **m_data;
cudaMallocManaged(&m_data, num_strings*sizeof(char *));
cudaMallocManaged(&(m_data[0]), 1024*sizeof(char));
cudaMallocManaged(&(m_data[1]), 1024*sizeof(char));
cudaMallocManaged(&(m_data[2]), 1024*sizeof(char));
// initialize said array directly from host code
for (int i = 0; i < num_strings; i++)
memcpy(m_data[i], pwdAry[i], 1024);
// call kernel directly on managed data
printf("method 3:\n");
kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_35 -o t1036 t1036.cu
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
$ cuda-memcheck ./t1036
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 2:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 3:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
========= ERROR SUMMARY: 0 errors
$
Upvotes: 7