Program hit cudaErrorIllegalAdress without cuda-memcheck error when running program with a large dataset

Question

I'm new to cuda and was working on a small project to learn how to use it when I came across the following error when executing my program:

CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost)"

The program purpose is to obtain a kmer distance matrix, so the program takes as input a fasta file with sequences and the k which will be used to calculate the distance between the sequences. *A fasta file structure is fairly simple, it has a series of headers and sequences:

>Header 1
ACGTAGT...TCG
>Header 2
CAGAGT...ACT

The program is structured as follows:

The fasta file is read, this is made on host. The headers are stored on a vector and the sequences are stored on a vector and also on a char**, when I first made the program, without parallelization, I used vectors and strings but since I realized it would be difficult to translate those vectors and string to other device compatible string and vector like structures I decided to use the option char**.
The kmer distance is calculated, this is done on device via the kmdist function, which is called for each string (sequence) comparison (A lot of functions are called inside this part):

2.1. A string is transformed into a string with all substrings of size k, for example the string "abcde" comes in, and if we have a k = 3, the string "abcbcdcde" comes out, so if the original string is of size N, then the string that comes out has size (N-k+1)*k.

2.2. The substrings string is alphabetically sorted (this can be improved but it will be done later), for example if the string "cddabcaba" comes in, then the string "abaabccdd" comes out.

2.3. Two substring strings are compared too see how many substrings are shared, for example: the substrings "abccde" and the substring "abccddcde" would return a 2, since they share the substrings "abc" and "cde".

2.4. The kmer distance is calculated using a formula.

It is also worth mentioning that even if N*N (N being the total number of sequences) threads are reserved, only the necessary comparisons are made, this is because the matrix is symmetrical and the diagonal has 0's, I know that there must be a better way, but I couldn't think of any other option.

The source code is:

aligner.h:

#ifndef ALIGNER_H
#define ALIGNER_H
#include 
#include 
#include 
#include 
#include 
#include 
using namespace std;

class aligner{
public:
  __host__ aligner();
  __host__ aligner(string name);
  __host__ ~aligner();
  __host__ double length();
  __host__ void getReads();
  char** h_reads;
  vector* reads;
  vector* headers;
  __device__ static void swapp(char* a,int id1,int id2,int k);
  __device__ static double compare(char* v1,char* v2,int* k);
  __device__ static float kmdist(char* A,char* B,int* k);
  __device__ static void veckm(char *test,char *a,int k);
private:
  string fileName;
  double noReads;
};
#endif

aligner.cu:

#include "aligner.h"

__host__ aligner::aligner(){}
__host__ aligner::aligner(string name){
  fileName = name;
}
__host__ aligner::~aligner(){}

__host__ double aligner::length(){
    double counter = 0;
    ifstream file(fileName.c_str());
    string data;
    while(getline(file,data)){
        if(data[0] == '>')counter++;
    }
    noReads =  counter;
    return counter;
}
__host__ void aligner::getReads(){
  reads = new vector(noReads);
  h_reads = new char*[int(noReads)];
  headers =  new vector(noReads);
  ifstream file(fileName.c_str());
  stringstream buffer;
  buffer << file.rdbuf();
  string data,id,seq;
  double indx = 0;
  while (getline(buffer, data)) {
    if(data.empty())
        continue;
    if(data[0] == '>') {
      if(!id.empty())
        {reads->at(indx) = seq;
        h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
        strcpy(h_reads[int(indx)],seq.c_str());
        h_reads[seq.size()] = '\0';
        headers->at(indx) = id;
        indx++;}
      id = data.substr(1);
      seq.clear();
    }
    else {
      seq += data;
    }
  }
  if(!id.empty()){
    reads->at(indx) = seq;
    h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
    strcpy(h_reads[int(indx)],seq.c_str());
    h_reads[seq.size()] = '\0';
    headers->at(indx) = id;
  }
}
__device__ void aligner::swapp(char* a,int id1,int id2,int k){
  char c;
  for(int i=0;i b[i]){return false;}
  }
  return false;
}
__device__ void alphsort(char* arr,int size,int k){
  char *tmp1,*tmp2;
  tmp2 = (char*)malloc(k*sizeof(char));
  tmp1 = (char*)malloc(k*sizeof(char));
  for(int i = 0;i<(size*k);i+=k){
    for(int l = 0;l<(size-1)*k;l+=k){
      for(int j = 0;j b[i]){return 1;} //b es mayor
  }
  return 2; //son iguales
}

__device__ double aligner::compare(char* v1 , char* v2,int* k){
  int len1 = 0,len2=0;
  double res = 0;
  int dk = *k;
  while (v1[len1] !='\0') {
    len1++;}
  while (v2[len2] !='\0') {
    len2++;}
  int size1 = len1/dk;
  int size2 = len2/dk;
  alphsort(v1,size1,dk);
  alphsort(v2,size2,dk);
  int i = 0,j=0;
  char *tmp1, *tmp2;
  tmp1 = (char*)malloc(dk*sizeof(char));
  tmp2 = (char*)malloc(dk*sizeof(char));
  int mm;
  while (i


main.cu:
#include "aligner.h"
#include 
#include 
__global__ void kernel(float* MAT,char** a,int *k,double *len){
  int col = blockIdx.y*blockDim.y+threadIdx.y;
  int row = blockIdx.x*blockDim.x+threadIdx.x;
  int l = int(*len);
  if(col < l && row < l){
    if(col>row){
      MAT[row*l+col] = aligner::kmdist(a[row],a[col],k);
    }
  }
}
void compare(float* MAT,float* HMAT,double size,double len,char* r[],int k);
int main(int argc, char const *argv[]) {
  string file = argv[1];
  int K = atoi(argv[2]);
  aligner objAl(file);
  double len = objAl.length();
  objAl.getReads();
  double size = len*len;
  float* h_mat;
  h_mat = (float*)malloc(int(size*sizeof(float)));
  memset(h_mat,0,int(size*sizeof(float)));
  float* d_mat;
  checkCudaErrors(cudaMalloc((void**)&d_mat,int(size*sizeof(float))));
  checkCudaErrors(cudaMemcpy(d_mat,h_mat,int(size*sizeof(float)),cudaMemcpyHostToDevice));
  compare(d_mat,h_mat,size,len,objAl.h_reads,K);
  //This was used to print the matrix and observe the output when it was small
  /*for(int i = 0;i 256){
    threadsPerBlock.x = 16;
    threadsPerBlock.y = 16;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }
  //Take time
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  //Parallel function
  cudaEventRecord(start,0);
  kernel<<>>(MAT,d_reads,d_k,d_len);
  cudaDeviceSynchronize();
  cudaEventRecord(stop,0);
  cudaEventSynchronize(stop);
  float timer = 0;
  cudaEventElapsedTime(&timer,start,stop);
  cout << "Elapsed parallel time:" << timer/1000 << "seconds" << endl;
  cudaDeviceSynchronize();
  checkCudaErrors(cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
}

It was compiled with the following command:
nvcc -rdc=true -lineinfo main.cu aligner.cu -o main

And when it ran, this happened:
-bash-4.1$ ./main ../prueba100.fasta 4
Elapsed parallel time:0seconds
CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost)" 

And when I used cuda-memcheck to run the code this was the output:
-bash-4.1$ cuda-memcheck ./main ../prueba100.fasta 4
========= CUDA-MEMCHECK
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x404d6]
=========     Host Frame:./main [0x51a5]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventRecord. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x57572]
=========     Host Frame:./main [0x51b6]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x513ae]
=========     Host Frame:./main [0x51c2]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventElapsedTime. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x5106c]
=========     Host Frame:./main [0x51e7]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
Elapsed parallel time:0seconds
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x404d6]
=========     Host Frame:./main [0x5244]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost)" 
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x42cbf]
=========     Host Frame:./main [0x527d]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Error: process didn't terminate successfully
=========        The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= No CUDA-MEMCHECK results found

I then used cuda-gdb to see if I could diagnose the problem, but I dont know if I'm not using it correctly so I dont know what the output is telling me or if it isn´t helpful at all.
(cuda-gdb) run ../prueba100.fasta 3
Starting program: /home/ulises2/parallel_programing/ivan/Proyecto/Parallel/main ../prueba100.fasta 3
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: File "/share/apps/gcc/gcc6/lib64/libstdc++.so.6.0.22-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
    add-auto-load-safe-path /share/apps/gcc/gcc6/lib64/libstdc++.so.6.0.22-gdb.py
line to your configuration file "/export/home/ulises2/.cuda-gdbinit".
To completely disable this security protection add
    set auto-load safe-path /
line to your configuration file "/export/home/ulises2/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g., run from the shell:
    info "(gdb)Auto-loading safe path"
[New Thread 0x7ffff6853700 (LWP 11540)]
[New Thread 0x7ffff5e52700 (LWP 11541)]

CUDA Exception: Device Illegal Address
The exception was triggered in device 0.

Thread 1 "main" received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (1,1,0), thread (0,6,0), device 0, sm 6, warp 3, lane 0]
0x0000000000bb72b8 in aligner::kmdist(char*, char*, int*) ()

What I understood from the cuda-gdb output is that, even if cuda-memcheck says the error occurrs when doing de cudaMemcpy, the error is really happening at the function aligner::kmdist(char*, char*, int*) which is called before inside the kernel.
When I tried the function whith just 2 sequences the code works, but its important to mention that it is very very slow, this implementation for those 2 sequences is comparable with the serial implementation when comparing 100 sequences.
It is also worth mentioning that the kmer distance calculated for those 2 sequences changed when  I ran the code with those 2 sequences plus other 3 sequences, for a total of 5 sequence comparisons.
-bash-4.1$ ./main ../prueba2.fasta 3
Elapsed parallel time:8.58585seconds
0.000000  0.002770  
0.000000  0.000000  
-bash-4.1$ ./main ../prueba5.fasta 3
Elapsed parallel time:185.891seconds
0.000000  0.177778  0.000000  0.000000  0.000000  
0.000000  0.000000  0.005540  0.000000  0.000000  
0.000000  0.000000  0.000000  0.023438  0.150875  
0.000000  0.000000  0.000000  0.000000  0.021137  
0.000000  0.000000  0.000000  0.000000  0.000000  

As you can see, the value of 0.002770 changes to 0.177778, at some point I printed the output of the veckm function to see if it was working properly and I noted that in the first example it works fine, the output is the one that is supposed to, but in the second example it isn't, even if the function is the same, so I'm assuming it is related to how I managed the memory, but I'm not sure about what is wrong.
I tried defining a __shared__ array along with __syncthreads() but the output of the program remained the same, I also tried reducing the number of threads per block (that is the reason why  I'm using 16), but originally the code was:
if (len*len > 1024){
    threadsPerBlock.x = 32;
    threadsPerBlock.y = 32;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }

The files used to try the program can be found here.
Some extra information:

The program was executed on a cluster that uses slurm. The partition of the cluster that was used has 2 Tesla K40m.

nvcc verison:


-bash-4.1$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130

I know there are a lot of things that can be improved but I want to have it running and functional before I start to optimize it.

Robert Crovella · Accepted Answer

Your in-kernel malloc operation are exceeding the device heap size.

Any time you are having trouble with a CUDA code that uses in kernel malloc or new, it's good practice (at least as a diagnostic) to check the returned pointer value for NULL, before attempting to use (i.e. dereference) it.

When I do that in your code right after the malloc operations in aligner::kmdist, I get the asserts being hit, indicating NULL return values. This is the indication you have exceeded the device heap. You can increase the device heap size.

When I increase the device heap size to 1GB, this particular issue disappears, and at that point cuda-memcheck may start reporting other errors (I don't know, your application may have other defects, but the proximal issue here is exceeding the device heap).

As an aside, I also recommend that you compile your code to match the architecture you are running on:

nvcc -arch=sm_35 -rdc=true ...

It seems there is another problem in your code as well.

From what I can see, strlen() returns the length of the string not including the null terminator. You're using this length to size the array of strings you allocate and pass to the kernel:

  for(int i=0;i


This means you are not allocating space for the null terminator, and you are not copying the null terminator to the device.
But in your device code (aligner::kmdist) you are doing this on these strings:
  while (A[len1] !='\0') {
    len1++;}
  while (B[len2] !='\0') {
    len2++;}

So that design is broken.  (Not sure why you would just pass an array of lengths, but I digress...)
I believe a trivial fix could be:
   slen = strlen(r[i])+1;

A similar problem manifests itself in aligner::compare and aligner::kmdist. (i.e. there may be at least 3 fixes required for this design error).
Your use of double variables for things like size and len also strikes me as bizarre.
With the following modified files,
aligner.cu:
#include "aligner.h"
#include 
__host__ aligner::aligner(){}
__host__ aligner::aligner(string name){
  fileName = name;
}
__host__ aligner::~aligner(){}

__host__ double aligner::length(){
    double counter = 0;
    ifstream file(fileName.c_str());
    string data;
    while(getline(file,data)){
        if(data[0] == '>')counter++;
    }
    noReads =  counter;
    return counter;
}
__host__ void aligner::getReads(){
  reads = new vector(noReads);
  h_reads = new char*[int(noReads)];
  headers =  new vector(noReads);
  ifstream file(fileName.c_str());
  stringstream buffer;
  buffer << file.rdbuf();
  string data,id,seq;
  double indx = 0;
  while (getline(buffer, data)) {
    if(data.empty())
        continue;
    if(data[0] == '>') {
      if(!id.empty())
        {reads->at(indx) = seq;
        h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
        strcpy(h_reads[int(indx)],seq.c_str());
        h_reads[seq.size()] = '\0';
        headers->at(indx) = id;
        indx++;}
      id = data.substr(1);
      seq.clear();
    }
    else {
      seq += data;
    }
  }
  if(!id.empty()){
    reads->at(indx) = seq;
    h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
    strcpy(h_reads[int(indx)],seq.c_str());
    h_reads[seq.size()] = '\0';
    headers->at(indx) = id;
  }
}
__device__ void aligner::swapp(char* a,int id1,int id2,int k){
  char c;
  for(int i=0;i b[i]){return false;}
  }
  return false;
}
__device__ void alphsort(char* arr,int size,int k){
  char *tmp1,*tmp2;
  tmp2 = (char*)malloc(k*sizeof(char));
  tmp1 = (char*)malloc(k*sizeof(char));
  for(int i = 0;i<(size*k);i+=k){
    for(int l = 0;l<(size-1)*k;l+=k){
      for(int j = 0;j b[i]){return 1;} //b es mayor
  }
  return 2; //son iguales
}

__device__ double aligner::compare(char* v1 , char* v2,int* k){
  int len1 = 0,len2=0;
  double res = 0;
  int dk = *k;
  while (v1[len1] !='\0') {
    len1++;}
  while (v2[len2] !='\0') {
    len2++;}
  int size1 = len1/dk;
  int size2 = len2/dk;
  alphsort(v1,size1,dk);
  alphsort(v2,size2,dk);
  int i = 0,j=0;
  char *tmp1, *tmp2;
  tmp1 = (char*)malloc(dk*sizeof(char));
  tmp2 = (char*)malloc(dk*sizeof(char));
  int mm;
  while (i

and main.cu:
#include "aligner.h"
#include 
#include 
__global__ void kernel(float* MAT,char** a,int *k,double *len){
  int col = blockIdx.y*blockDim.y+threadIdx.y;
  int row = blockIdx.x*blockDim.x+threadIdx.x;
  int l = int(*len);
  if(col < l && row < l){
    if(col>row){
      MAT[row*l+col] = aligner::kmdist(a[row],a[col],k);
    }
  }
}
void compare(float* MAT,float* HMAT,double size,double len,char* r[],int k);
int main(int argc, char const *argv[]) {
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1048576ULL*256ULL));
  string file = argv[1];
  int K = atoi(argv[2]);
  aligner objAl(file);
  double len = objAl.length();
  objAl.getReads();
  double size = len*len;
  float* h_mat;
  h_mat = (float*)malloc(int(size*sizeof(float)));
  memset(h_mat,0,int(size*sizeof(float)));
  float* d_mat;
  checkCudaErrors(cudaMalloc((void**)&d_mat,int(size*sizeof(float))));
  checkCudaErrors(cudaMemcpy(d_mat,h_mat,int(size*sizeof(float)),cudaMemcpyHostToDevice));
  compare(d_mat,h_mat,size,len,objAl.h_reads,K);
  //This was used to print the matrix and observe the output when it was small
  /*for(int i = 0;i 256){
    threadsPerBlock.x = 16;
    threadsPerBlock.y = 16;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }
  //Take time
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  //Parallel function
  cudaEventRecord(start,0);
  printf("threads x = %d, threads y = %d, block x = %d, block y = %d
", threadsPerBlock.x, threadsPerBlock.y, blocksPerGrid.x, blocksPerGrid.y);
  kernel<<>>(MAT,d_reads,d_k,d_len);
  cudaDeviceSynchronize();
  cudaEventRecord(stop,0);
  cudaEventSynchronize(stop);
  float timer = 0;
  cudaEventElapsedTime(&timer,start,stop);
  cout << "Elapsed parallel time:" << timer/1000 << "seconds" << endl;
  cudaDeviceSynchronize();
  checkCudaErrors(cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
}

The code would run for me to completion on a K20X without error:
$ ./main prueba100.fasta 4
threads x = 16, threads y = 16, block x = 7, block y = 7
Elapsed parallel time:678.162seconds

Program hit cudaErrorIllegalAdress without cuda-memcheck error when running program with a large dataset

Answers (1)

Related Questions