Reputation: 49
I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.
Any help?
Here's the entire code:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[D*id+0]=a;
mutation[D*id+1]=b;
mutation[D*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
UPDATE - Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
Upvotes: 0
Views: 254
Reputation: 49
Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
Upvotes: 1