I am worried about a potential race condition in one of my CUDA kernels. I am working on an N-body simulator using the Barnes-Hut tree algorithm. The purpose of this kernel is to compute the total mass and centre of mass for each branch of the tree. I want to "iterate" over the containers array in reverse order, because the containers allocated last are the least likely to depend on other child containers, while the first containers in the array most likely do depend on later ones.
I am using an atomic counter to track which blocks start first: the first block handles the first few containers, the second block the next few, and so on. What I am worried about is whether the execution of a block can be temporarily suspended until other blocks finish, or something like that. This would be a problem because if, say, the first block starts and then yields to the others for whatever reason, and the others depend on computation performed by the first block, they will loop indefinitely.
__global__ void compute_mass_centers_kernel()
{
    // Claim a unique block number so that blocks pick up work in the order
    // they start, walking the containers array back to front.
    int blockNum = atomicAdd(&dev::block_number, 1);
    int cindex = dev::ncontainers - blockNum * blockDim.x - 1 - threadIdx.x;
    if (cindex < 0)
        return;

    Container& c = dev::containers[cindex];
    int missing_ptrs[8];
    int missing = 0;
    float total_mass = 0.0f;
    double3 com = {0};

    for (int i = 0; i < 8; i++)
    {
        if (c[i] > 1) // child slot holds an object
        {
            Object& o = objat(c[i]);
            total_mass += o.m;
            com.x += (double)o.p.x * o.m;
            com.y += (double)o.p.y * o.m;
            com.z += (double)o.p.z * o.m;
        }
        else if (c[i] < 1) // child slot holds another container
        {
            missing_ptrs[missing++] = c[i];
        }
    }

    // Busy-wait until every child container has published its result.
    while (missing)
    {
        for (int i = 0; i < missing; i++)
        {
            Container& c2 = ctrat(missing_ptrs[i]);
            if (c2.total_mass >= 0.0f) // child is done
            {
                total_mass += c2.total_mass;
                com.x += (double)c2.center_of_mass.x * c2.total_mass;
                com.y += (double)c2.center_of_mass.y * c2.total_mass;
                com.z += (double)c2.center_of_mass.z * c2.total_mass;
                // Compact the list: move the last entry into slot i and retry it.
                missing_ptrs[i--] = missing_ptrs[--missing];
            }
        }
    }

    c.center_of_mass.x = com.x / total_mass;
    c.center_of_mass.y = com.y / total_mass;
    c.center_of_mass.z = com.z / total_mass;
    c.total_mass = total_mass;
}
void compute_mass_centers()
{
    int threads, blocks;
    // Ask the occupancy API for a good block size; the grid size it suggests
    // is overwritten below with one computed from the container count.
    cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, compute_mass_centers_kernel, 0, 0);
    cucheck();

    int ncontainers;
    cudaMemcpyFromSymbol(&ncontainers, dev::ncontainers, sizeof(int), 0, cudaMemcpyDeviceToHost);
    cucheck();
    blocks = (ncontainers + (threads - 1)) / threads;

    // Reset the global block counter before every launch.
    cudaMemcpyToSymbol(dev::block_number, &ZERO, sizeof(int), 0, cudaMemcpyHostToDevice);
    cucheck();

    compute_mass_centers_kernel<<<blocks, threads>>>();
    cucheck();
}
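Note that the busy-wait on c2.total_mass only terminates if every container's total_mass starts out negative. A minimal sketch of such a reset pass, assuming a -1.0f "not yet computed" sentinel (the question does not show its initialization code, so this kernel is an assumption):

// Assumed initialization: mark every container as "not yet computed" so the
// spin-wait in compute_mass_centers_kernel has a sentinel to test against.
__global__ void reset_mass_kernel()
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < dev::ncontainers)
        dev::containers[i].total_mass = -1.0f;
}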
There is no such thing as a CUDA inter-block synchronization primitive. Nevertheless, people have done research on it, for example: Shucai Xiao and Wu-chun Feng, "Inter-Block GPU Communication via Fast Barrier Synchronization".
In your case, one could simply do either several kernel calls, each with a single block, or, if you are adventurous, a self-made (slow) blocking atomic operation in global memory to synchronize. A sketch of the several-kernel-calls route follows below.
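One variant of the several-kernel-calls idea: if the host groups containers into dependency levels (deepest level first), every launch only reads children already finished by an earlier launch, so the kernel boundary itself acts as the barrier. The level_offset bookkeeping and compute_level_kernel below are hypothetical, not part of the question's code:

#include <vector>

// Hypothetical per-level kernel: every container in [first, first + count)
// only has children that earlier launches have already finished.
__global__ void compute_level_kernel(int first, int count)
{
    int i = first + blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= first + count)
        return;
    Container& c = dev::containers[i];
    float total_mass = 0.0f;
    double3 com = {0};
    for (int j = 0; j < 8; j++)
    {
        if (c[j] > 1) // object child
        {
            Object& o = objat(c[j]);
            total_mass += o.m;
            com.x += (double)o.p.x * o.m;
            com.y += (double)o.p.y * o.m;
            com.z += (double)o.p.z * o.m;
        }
        else if (c[j] < 1) // container child, completed by a previous launch
        {
            Container& c2 = ctrat(c[j]);
            total_mass += c2.total_mass;
            com.x += (double)c2.center_of_mass.x * c2.total_mass;
            com.y += (double)c2.center_of_mass.y * c2.total_mass;
            com.z += (double)c2.center_of_mass.z * c2.total_mass;
        }
    }
    c.center_of_mass.x = com.x / total_mass;
    c.center_of_mass.y = com.y / total_mass;
    c.center_of_mass.z = com.z / total_mass;
    c.total_mass = total_mass;
}

// level_offset[k] .. level_offset[k+1] is assumed to span the containers of
// level k, deepest level first, so dependencies always point to earlier launches.
void compute_mass_centers_by_level(const std::vector<int>& level_offset)
{
    for (size_t k = 0; k + 1 < level_offset.size(); k++)
    {
        int first = level_offset[k];
        int count = level_offset[k + 1] - first;
        int threads = 256;
        int blocks = (count + threads - 1) / threads;
        // Launches on the same stream execute in order, so the next level
        // is guaranteed to see this level's results.
        compute_level_kernel<<<blocks, threads>>>(first, count);
        cucheck();
    }
}

No spin loop is needed here, because the in-order execution of same-stream launches guarantees the children are ready before their parents run.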
For your underlying problem, the best approach might be to check your code with cuda-memcheck.
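For example (the binary name is a placeholder), compiling with nvcc -lineinfo and then running cuda-memcheck ./nbody reports out-of-bounds and misaligned accesses together with the offending source line.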