Ander Biguri

Reputation: 35525

Variable gets lost after allocating array of structs in CUDA

I have a C struct that contains arrays of other structs, and I need a copy of it on the GPU. For that I am writing a function that performs the cudaMalloc and cudaMemcpy calls needed to copy the struct's members from host to device.

A simplified version of the struct (the real one has several structs and variables/arrays inside) is:

struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};

My problem is that I must be doing something wrong in the memory allocation and copying of the struct. When I copy the variables within Graph, I can see that they are properly copied (by accessing them in a kernel as in the example below). For example, I can check that graph.nBoundary = 3.

However, I can only see this if I do not allocate and copy the memory of the Node*. If I do, I get -858993460 instead of 3. Interestingly, the Node* itself is not wrongly allocated, as I can inspect the value of, say, graph.node[0].position[0] and it has the correct value.

This only happens with graph.nBoundary. All the other variables keep their correct numerical values, but this one gets corrupted when the cudaMemcpy of the Node* runs.

What am I doing wrong and why does this happen? How do I fix it?

Let me know if you need more information.


MCVE:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime_api.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph, unsigned int* d_res){
    d_res[0] = graph->nBoundary;
}
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++){
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel<<<1, 1>>>(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%d", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph){
    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));

    //copy constants
    gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));


    // copy boundary
    unsigned int * d_auxboundary, *h_auxboundary;
    h_auxboundary = inGraph->boundary;
    gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));


    //Create nodes 
    Node * auxnode;
    gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));

    // Create auxiliary pointers to grab them from host and pass them to device
    float ** d_position, ** h_position;
    d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
    h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));

    for (int i = 0; i < inGraph->nNode; i++){

        // Positions
        h_position[i] = inGraph->node[i].position;
        gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));

    }
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////// If I comment the following section, nBoundary can be read by the kernel
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////

    gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));



    return outGraph;
}

Upvotes: 1

Views: 459

Answers (1)

sgarizvi

Reputation: 16816

The problem is in the function cudaGraphMalloc, where you are trying to allocate device memory to the members of outGraph, which has already been allocated on the device. In the process of doing so, you are dereferencing a device pointer on the host, which is illegal. This is also why nBoundary gets corrupted: the last cudaMemcpy copies inGraph->nNode * sizeof(Node *) bytes to &outGraph->node, more than the single pointer-sized member it targets, so the copy overruns into the adjacent nBoundary field.

To allocate device memory for the members of a struct variable that exists on the device, we first have to create a temporary host variable of that struct type, allocate device memory for its members, and then copy the temporary to the struct that exists on the device.
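
As a minimal sketch of that pattern, for a hypothetical struct with a single device-side array member (Buf, n and h_data are illustrative names, not part of the code below):

    // Hypothetical struct with one pointer member that should point to device memory
    struct Buf { float* data; };

    Buf temp;                                            // temporary host variable
    cudaMalloc((void**)&temp.data, n * sizeof(float));   // device memory for the member
    cudaMemcpy(temp.data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

    Buf* d_buf;                                          // the struct that lives on the device
    cudaMalloc((void**)&d_buf, sizeof(Buf));
    cudaMemcpy(d_buf, &temp, sizeof(Buf), cudaMemcpyHostToDevice);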

I have answered a similar question here. Please take a look at it.

The fixed code may look like this:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph {
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph, unsigned int* d_res) {
    d_res[0] = graph->nBoundary;
}
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++) {
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++) {
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel<<<1, 1>>>(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%u\n", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph) 
{
    //Create auxiliary Graph variable on host
    Graph temp;

    //copy constants
    temp.nNode = inGraph->nNode;
    temp.nBoundary = inGraph->nBoundary;

    // copy boundary
    gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));


    //Create nodes 
    size_t nodeBytesTotal = temp.nNode * sizeof(Node);
    gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));

    for (int i = 0; i < temp.nNode; i++)
    {
        //Create auxiliary node on host
        Node auxNodeHost;

        //Allocate device memory to the position member of the auxiliary node
        size_t nodeBytes = 3 * sizeof(float);
        gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
        gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));

        //Copy the auxiliary host node to the device
        Node* dPtr = temp.node + i;
        gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
    }


    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));

    return outGraph;
}

Be advised that you will have to keep host copies of the internal device pointers (i.e., the auxiliary host variables). This is because you will have to free the device memory later, and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case, the variable Node auxNodeHost (created in each iteration) and Graph temp are those variables.

The above code does not do that and is just for demonstration purposes.
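
A minimal sketch of what that cleanup could look like, assuming the host-side mirror (the Graph temp above) is kept around or returned alongside the device pointer; instead of storing every auxNodeHost, this version copies the node array back from the device to recover the position pointers:

    void cudaGraphFree(Graph* d_graph, const Graph* hostMirror)
    {
        // Recover the per-node device pointers by copying the node array back to the host
        Node* h_nodes = (Node*)malloc(hostMirror->nNode * sizeof(Node));
        gpuErrchk(cudaMemcpy(h_nodes, hostMirror->node, hostMirror->nNode * sizeof(Node), cudaMemcpyDeviceToHost));

        // Free each node's device position array
        for (unsigned int i = 0; i < hostMirror->nNode; i++)
            gpuErrchk(cudaFree(h_nodes[i].position));
        free(h_nodes);

        // Free the member arrays the mirror points to, then the struct itself
        gpuErrchk(cudaFree(hostMirror->node));
        gpuErrchk(cudaFree(hostMirror->boundary));
        gpuErrchk(cudaFree(d_graph));
    }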

Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

Upvotes: 2
