CUDA code doesnt work, why?

Question

I have this code, but sometimes It works, sometimes NOT(write printf("ERR:%d ", id)). I work under CUDA 4.1 and have GTS450 which is compute capability 2.1.

The code doesnt have higher purpose, Iam just trying to find why Its not working, because My mind tell me, that right :]

If you want to run it, maybe you need to execute few times, when "error" appear or change grid size!

PS: here you can download exe file for win64 - you need to have cuda4.1 driver

class MAN
{
public:
    int m_id;
    int m_use;

    __device__
    MAN()
    {
        m_id = -1;
        m_use = 0;
    }
};

__device__ int* d_ids = NULL;
__device__ int d_last_ids = 0;

__device__ MAN* d_mans = NULL;


__global__ void init()
{
    d_mans = new MAN[500];  //note: 500 is more than enough!
    d_ids = new int[500];

    for(int i=0; i < 500; i++)
        d_ids[i] = 0;
}


__device__ int getMAN() //every block get unique number, so at one moment all running blocks has different id
{
    while(true)
    {
        for(int i=0; i < 500; i++)
            if(atomicCAS(&(d_mans[i].m_use), 0, 1)==0)
                return i;
    }
}
__device__ void returnMAN(int id)
{
    int s = atomicExch(&(d_mans[id].m_use), 0);
}



__global__ void testIt()
{
    if(threadIdx.x==0)
    {
        int man = getMAN();

        int id = d_mans[man].m_id;
        if(id == -1)    //If It never works with this "id", its creating new
        {
            id = atomicAdd(&d_last_ids, 2);

            d_ids[id] = 10; //set to non-zero
            d_mans[man].m_id = id;  //save new id for next time

            printf("ADD:%d
", id);
        }

        if(d_ids[id]==0)
            printf("ERR:%d
", id); //THIS SHOULD NEVER HAPPEN, BUT BECOMES !!!

        returnMAN(man);
    }
}



int main()
{
    init<<<1, 1>>>();
    printf("init() err: %d
", cudaDeviceSynchronize());

    testIt<<<20000, 512>>>();
    printf("testIt() err: %d
", cudaDeviceSynchronize());

    getchar();
    return 0;
}

Milan · Accepted Answer

I have changed this:

__device__ int* d_ids = NULL;

to this:

__device__ volatile int* d_ids = NULL;

and it works ok!!!

And even It doesnt need __threadfence();

CUDA code doesnt work, why?

Answers (2)

Related Questions