writer galaxy
writer galaxy

Reputation: 1

How do I resolve "cudaErrorMissingConfiguration" returned by CUDA's "cudaMallocPitch" function?

I'm writing a Mandelbrot set program with CUDA. However, I can't make any further progress until I resolve the cudaErrorMissingConfiguration error returned by CUDA's cudaMallocPitch() function. Could you tell me what causes it?

My GPU is GeForce RTX 2060 SUPER.

My compile command is shown below.

> nvcc MandelbrotCUDA.cu -o MandelbrotCUDA -O3

I tried cudaDeviceSetLimit( cudaLimitMallocHeapSize, 7*1024*1024*1024 ) to resize the heap.

The cudaDeviceSetLimit call succeeded.

However, the program still gets no further: it never prints "CUDA malloc done!". The code is below.

#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;

#define D 0.0000025 // Tick
#define LIMIT_N 255 
#define INF_NUM 2

#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2 

// Kernel intended to count, for every grid point (i, j), how many of the
// LIMIT_N iterations of z = z*z + c leave abs(z) below INF_NUM.
//   indexTotalX : upper bound of the inner (j) loop
//   indexTotalY : upper bound of the outer (i) loop
//   n           : destination for the per-point counts
//   c           : per-point complex constants added on every iteration
// NOTE(review): threadIdx/blockIdx are never read, so each launched thread
// walks the complete i/j range on its own.
// NOTE(review): n and c are declared as triple pointers, which makes n[i][j]
// an int* and c[i][j] a thrust::complex<double>*. "n[i][j] = 0" therefore
// stores a null pointer and the "+" on n[i][j] is pointer arithmetic --
// confirm the intended memory layout before relying on this kernel.
__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.

    for(int i = 0; i < indexTotalY ; i++){
        for(int j = 0; j < indexTotalX; j++){ 
            // Every point starts its iteration from z = 0.
            thrust::complex<double> z(0.0f, 0.0f);
            n[i][j] = 0;
            // Fixed trip count: the loop does not exit early once abs(z) reaches INF_NUM.
            for(int ctr=1;  ctr <= LIMIT_N ; ctr++){  
                z = z*z + (*(c[i][j]));
                // The comparison yields 0 or 1, which is added to the running value.
                n[i][j] = n[i][j] + (abs(z) < INF_NUM);
            }
        }
    }
}

// Host entry point (listing is truncated in the post: the closing brace is not shown).
// Builds the grid of complex sample points on the host, raises the device
// malloc-heap limit, and then tries to allocate pitched device memory for
// the result counts.
int main(){

    // Data Path
    string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
    string fileName = "mandelbrot4.ppm";
    string filename = filePath+fileName;

    //complex<double> c[N][M];
    // Sampled region of the complex plane: real axis in xRange, imaginary axis in yRange.
    double xRange[2] = {-0.76, -0.74};
    double yRange[2] = {0.05, 0.1};

    // Number of sample points per axis; the double quotient is truncated to int.
    const int indexTotalX = (xRange[1]-xRange[0])/D;
    const int indexTotalY = (yRange[1]-yRange[0])/D;

    thrust::complex<double> **c;
    //c = new complex<double> [N];
    cout << "debug_n" << endl;
    int **n;
    // Host-side 2D arrays built as indexTotalY separately allocated rows of
    // indexTotalX elements each (rows are not contiguous with one another).
    n = new int* [indexTotalY];
    c = new thrust::complex<double> * [indexTotalY];
    for(int i=0;i<indexTotalY;i++){
        n[i] = new int [indexTotalX];
        c[i] = new thrust::complex<double> [indexTotalX];
    }

    cout << "debug_n_end" << endl;

    // Fill c with the sample coordinates: column j maps to the real part,
    // row i maps to the imaginary part, both stepped by D.
    for(int i = 0; i < indexTotalY; i++){
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
            c[i][j] = tmp;
            //n[i*sqrt(N)+j] = 0;
        }
    }

    // CUDA malloc
    cout << "CUDA malloc initializing..." << endl;  

    // Device-side pointers. dC is declared but not used in the lines shown.
    int **dN;
    thrust::complex<double> **dC;

    cudaError_t error;


    // NOTE(review): 7*1024*1024*1024 is evaluated with int operands and its
    // mathematical value (7516192768) does not fit in a 32-bit int -- verify
    // the value that actually reaches cudaDeviceSetLimit.
    error = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024);
    if(error != cudaSuccess){
        cout << "cudaDeviceSetLimit's ERROR CODE = " << error << endl;
        // The process exit status is 0 even on this failure path.
        return 0;
    }

    size_t tmpPitch;
    // NOTE(review): dN is cast and passed by value while still uninitialized;
    // its address (&dN) is not taken. The third argument is derived from
    // indexTotalY and the fourth from indexTotalX, and both are multiplied by
    // sizeof(int) -- confirm against the cudaMallocPitch width/height contract.
    error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
    if(error != cudaSuccess){
        // Prints the numeric cudaError_t value together with both grid dimensions.
        cout << "CUDA ERROR CODE = " << error << endl;
        cout << "indexTotalX = " << indexTotalX << endl;
        cout << "indexTotalY = " << indexTotalY << endl;
        return 0;
    }

    cout << "CUDA malloc done!" << endl;

The console output is shown below.

debug_n
debug_n_end
CUDA malloc initializing...
CUDA ERROR CODE = 1
indexTotalX = 8000
indexTotalY = 20000

Upvotes: 0

Views: 201

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 152174

There are several problems here:

int **dN;
...
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));

The correct type of pointer to use in CUDA allocations is a single pointer:

int *dN;

not a double pointer:

int **dN;

(so your kernel, where you are trying to pass triple-pointers:

void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.

is almost certainly not going to work, and should not be designed that way, but that is not the question you are asking.)

The pointer is passed to the allocating function by its address:

error = cudaMallocPitch((void **)&dN,

For cudaMallocPitch, only the horizontal requested dimension is scaled by the size of the data element. The allocation height is not scaled this way. Also, I will assume X corresponds to your allocation width, and Y corresponds to your allocation height, so you also have those parameters reversed:

error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));

The cudaLimitMallocHeapSize should not be necessary to set to make any of this work. It applies only to in-kernel allocations. Reserving 7GB on an 8GB card may also cause problems. Until you are sure you need that (it's not needed for what you have shown) I would simply remove that.

$ cat t1488.cu
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;

#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2

#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2

__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.

    for(int i = 0; i < indexTotalY ; i++){ // threadIdx/blockIdx are never read: each thread walks every row itself
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> z(0.0f, 0.0f); // iteration for each point starts from z = 0
            n[i][j] = 0; // NOTE(review): n is int***, so n[i][j] is an int* and this stores a null pointer
            for(int ctr=1;  ctr <= LIMIT_N ; ctr++){ // fixed LIMIT_N iterations, no early exit
                z = z*z + (*(c[i][j])); // c[i][j] is a thrust::complex<double>* that is dereferenced here
                n[i][j] = n[i][j] + (abs(z) < INF_NUM); // adds the 0/1 comparison result (pointer arithmetic, given int***)
            }
        }
    }
}

int main(){ // host entry point: builds the sample grid, then performs the corrected pitched allocation

    // Data Path
    string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
    string fileName = "mandelbrot4.ppm";
    string filename = filePath+fileName;

    //complex<double> c[N][M];
    double xRange[2] = {-0.76, -0.74}; // real-axis bounds of the sampled region
    double yRange[2] = {0.05, 0.1}; // imaginary-axis bounds of the sampled region

    const int indexTotalX = (xRange[1]-xRange[0])/D; // sample count along x; double quotient truncated to int
    const int indexTotalY = (yRange[1]-yRange[0])/D; // sample count along y; double quotient truncated to int

    thrust::complex<double> **c;
    //c = new complex<double> [N];
    cout << "debug_n" << endl;
    int **n;
    n = new int* [indexTotalY]; // host arrays: indexTotalY separately allocated rows
    c = new thrust::complex<double> * [indexTotalY];
    for(int i=0;i<indexTotalY;i++){
        n[i] = new int [indexTotalX]; // each row holds indexTotalX elements
        c[i] = new thrust::complex<double> [indexTotalX];
    }

    cout << "debug_n_end" << endl;

    for(int i = 0; i < indexTotalY; i++){
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D ); // column j -> real part, row i -> imaginary part
            c[i][j] = tmp;
            //n[i*sqrt(N)+j] = 0;
        }
    }

    // CUDA malloc
    cout << "CUDA malloc initializing..." << endl;

    int *dN; // single-level device pointer that cudaMallocPitch fills in
    thrust::complex<double> **dC; // declared but never referenced in this listing

    cudaError_t error;


    size_t tmpPitch; // receives the row pitch reported by cudaMallocPitch
    error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY)); // address of dN; width in bytes, height as a row count
    if(error != cudaSuccess){
        cout << "CUDA ERROR CODE = " << error << endl; // prints the numeric cudaError_t value
        cout << "indexTotalX = " << indexTotalX << endl;
        cout << "indexTotalY = " << indexTotalY << endl;
        return 0; // exit status is 0 even on this failure path
    }

    cout << "CUDA malloc done!" << endl; // NOTE(review): dN is not passed to cudaFree and the host rows are not deleted before main returns
}
$ nvcc -o t1488 t1488.cu
t1488.cu(68): warning: variable "dC" was declared but never referenced

$ cuda-memcheck ./t1488
========= CUDA-MEMCHECK
debug_n
debug_n_end
CUDA malloc initializing...
CUDA malloc done!
========= ERROR SUMMARY: 0 errors
$

Upvotes: 1

Related Questions