Segmentation fault on cudaMalloc or cudaMemcpy

Question

New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:

  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 

  using namespace std;

  typedef struct password_t{
      char word[56];
      size_t length;
  } password;

  typedef struct libEntry_t{
      uint8_t digest[16];
      password pwd;
  } libEntry;

  // Generates a library of passwords and their corresponding MD5 hashes
  //
  // Params:
  //    numPwds - the number of passwords for which to generate hashes
  //    pwds    - the list of passwords to hash
  //    library - the array in which to store the unhashed/hashed password library
  __global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
  {
      // __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
      int index = (blockIdx.x * blockDim.x) + threadIdx.x;
      uint8_t hashed[16];

      if (index < numPwds) {
        cuda_md5(&pwds[index], hashed);
        for (int j = 0; j < 16; j++) {
          library[index].digest[j] = hashed[j];
        }
        library[index].pwd = pwds[index];
      }
  }

  int crack_password (uint8_t* classified)
  {
      int count = 10;
      unsigned int mem_size = sizeof(password) * count;
      password *h_pwds = (password*) malloc(mem_size);

      ifstream inFile("passwords.txt");
      if (!inFile) {
        cerr << "File passwords.txt not found." << endl;
        return -1;
      }

      string line;
      int i;
      while (getline(inFile, line)) {
        if (line.empty()) continue;
        memcpy(h_pwds[i].word,line.c_str(),line.size());
        h_pwds[i].length = line.size();
        cout << "Password: " << h_pwds[i].word << "
";
        cout << "Length: " << h_pwds[i].length << "
";
        i++;
      }

      inFile.close();

      /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
      password* d_pwds;
      cudaMalloc( (void**) &d_pwds, mem_size);
      cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);

      libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);

      libEntry* d_library;
      cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);

      int h_numPwds = i;
      cout << "INT NUMPWDS: " << h_numPwds << "
";

      int* d_numPwds;
      cudaMalloc( (void**) &d_numPwds, sizeof(int));
      cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);

      /*unsigned int threads_per_block = 1024;
      dim3  grid(1024, 1, 1);
      dim3  threads(threads_per_block, 1, 1);

      // generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
      generateLibraryKernel<<>>(d_numPwds[0], d_pwds, d_library);

      cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/

      return 0;
  }

  int main(int argc, char *argv[])
  {
      if (argc != 2) {
          fprintf(stderr, "usage: ./prog password
");
          return 1;
      }

      crack_password((uint8_t*) argv[1]);
      cout << "Hack Password: " << argv[1] << "
";
      return 0;
  }

I have gone through it line by line and I believe it happens on the following lines:

      int* d_numPwds;
      cudaMalloc( (void**) &d_numPwds, sizeof(int));
      cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);

When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!

How I have been checking for return status:

#define CUDA_SAFE_CALL(call) do {                                      \
  CUDA_SAFE_CALL_NO_SYNC(call);                                         \
  cudaError err = cudaThreadSynchronize();                              \
  if( cudaSuccess != err) {                                             \
     fprintf(stderr, "Cuda error in file '%s' in line %i : %s.
",      \
                 __FILE__, __LINE__, cudaGetErrorString( err) );        \
     exit(EXIT_FAILURE);                                                \
     } } while (0)

EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?

password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);

libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);

libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);

EDIT2: I cleaned up everything and still can't figure it out. By having CUDA_SAFE_CALL on the following line CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get segmentation fault even when every other memory allocation command is commented out.

Blizzard · Accepted Answer

For someone wondering what went wrong, I was able to fix it. I am not exactly sure what exactly was wrong but I had improper memory allocations at some places and in other cases I didn't even needed to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:

/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3  grid(1024, 1, 1);
dim3  threads(threads_per_block, 1, 1);

password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));

libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));

// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );

Where ERROR_CHECK is defined from the link above.

#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.

Segmentation fault on cudaMalloc or cudaMemcpy

Answers (1)

Related Questions