RagingRonny

Reputation: 5

Cuda C++ kernel using nested method calls of different objects

I'm currently working on a program that performs multiple parallel and independent calculations, which I decided to implement with CUDA. Specifically, I want to launch a kernel from inside a method of a class, which seems to work: the kernel receives the object it is called from in its current state, with the contents of the "this" pointer copied to device memory.

It is important to know that this object contains a vector of objects of another class, which is filled beforehand in host code. Inside the kernel I want to call a method, already declared as device code, on every object in that vector. This somewhat nested method call does not work: instead of a valid result vector I get back a vector of zeros, which indicates that something is not working properly, even though everything should be compatible with CUDA as far as I can tell.

There are two classes:

//Header of Ding-class
#pragma once
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include<stdio.h>
#include<iostream>
#include <math.h>
#include "test_klasse.cuh"

class Ding
{
private:
    int index;
    size_t size;
    double* result_vector;
    double prop;
    double result;
    test_klasse* stuff_vector;

public:
    __host__ __device__ Ding(int, size_t);
    __host__ __device__ ~Ding();

    __host__ void calculate_stuff(double);
    __host__ __device__ double get_prop();
    __host__ __device__ double get_result();
    __host__ __device__ double get_value(int);
};

//Source code of Ding-class
#include "Ding.cuh"

__global__ void calculation(Ding* teil, double* result, int size, double coeff) {

    int ii = blockIdx.x * blockDim.x + threadIdx.x;

    if (ii < size){
        result[ii] = (*teil).get_value(ii);
    }
}

__host__ __device__ Ding::Ding(int ind, size_t vec_size) {
    index = ind;
    prop = 1;
    result = 0;
    size = vec_size;

    stuff_vector = (test_klasse*)malloc(size * sizeof(test_klasse));

    result_vector = (double*)malloc(size * sizeof(double));

    for (int ii = 0; ii < size; ii++) {
        result_vector[ii] = 0;
        stuff_vector[ii] = test_klasse::test_klasse(ii, 2 * ii);
    }
}

__host__ __device__ Ding::~Ding() {};

__host__ void Ding::calculate_stuff(double coeff) {
    prop = 1;
    double* d_result;
    int* d_size;
    double sum = 0;
    Ding* d_teil;

    cudaMalloc(&d_teil, sizeof(Ding));
    cudaMalloc(&d_result, size * sizeof(double));
    cudaMalloc(&d_size, sizeof(int));

    cudaMemcpy(d_size, &size, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_teil, this, sizeof(Ding), cudaMemcpyHostToDevice);

    calculation <<< size / 512 + 1, 512 >>> (d_teil, d_result, size, coeff);

    cudaMemcpy(result_vector, d_result, size * sizeof(double), cudaMemcpyDeviceToHost);

    for (int ii = 0; ii < size; ii++) {
        std::cout << result_vector[ii] << std::endl;
        result += result_vector[ii];
    }
}

__host__ __device__ double Ding::get_prop() {
    return prop;
}

__host__ __device__ double Ding::get_result() {
    return result;
}

__host__ __device__ double Ding::get_value(int index) {
    return stuff_vector[index].get_sum();
}

The "inner" class, whose objects make up the vector "stuff_vector", is defined as follows:

//Header of test_klasse
#pragma once
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include "device_launch_parameters.h"

class test_klasse
{
private:
    int a;
    int b;
public:
    __host__ __device__ test_klasse(int, int);
    __host__ __device__ ~test_klasse();
    __host__ __device__ void add(int);
    __host__ __device__ int get_sum();
};

//Source code of test_klasse
#include "test_klasse.cuh"

__host__ __device__ test_klasse::test_klasse(int input_a, int input_b) {
    a = input_a;
    b = input_b;
}

__host__ __device__ test_klasse::~test_klasse() {}

__host__ __device__ void test_klasse::add(int s) {
    a += s;
    b -= s;
}

__host__ __device__ int test_klasse::get_sum() {
    return a + b;
}

Finally, the main function just creates an object of the Ding class and performs the calculation, which in turn launches the kernel:

#include<stdio.h>
#include<math.h>
#include "Ding.cuh"
#include<iostream>

int main() {
    Ding teil(5, 50);
    //std::cout << "Old result: " << std::endl;
    teil.calculate_stuff(3.1234);
    std::cout << "New result: " << teil.get_result() << std::endl;
    return 0;
}

Everything compiles without errors, and as long as I only call a method of the Ding object itself to fill the results, I get reasonable values. Things start to fail when I call methods on the members of "stuff_vector".

Does anybody know what is going wrong, or which limitation of CUDA computation I am currently not aware of?

Upvotes: 0

Views: 398

Answers (1)

Abator Abetor

Reputation: 2598

cudaMemcpy(d_teil, this, sizeof(Ding), cudaMemcpyHostToDevice) only performs a shallow copy of the Ding instance. That is, this->stuff_vector and d_teil->stuff_vector will hold the same pointer value.

However, since this->stuff_vector points to host memory, d_teil->stuff_vector will point to that same host memory as well. It must therefore not be dereferenced in device code.

There are also potential issues in your code that are unrelated to CUDA. For example, you assign objects of a class to uninitialized memory, which can cause errors for types with non-trivial assignment operators:

    stuff_vector = (test_klasse*)malloc(size * sizeof(test_klasse));

    for (int ii = 0; ii < size; ii++) {
        stuff_vector[ii] = test_klasse::test_klasse(ii, 2 * ii);
    }

Upvotes: 1
