user755974
user755974

Reputation: 49

CUDA DE kernel not launching

I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.

Any help?

Here's the entire code:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[D*id+0]=a;
        mutation[D*id+1]=b;
        mutation[D*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

UPDATE - Solution:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

Upvotes: 0

Views: 254

Answers (1)

user755974
user755974

Reputation: 49

Solution:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

Upvotes: 1

Related Questions