Adr
Adr

Reputation: 11

Parallel FOR loop with OpenACC using recursive pointers

I have this parallel FOR loop with OpenMP:

#pragma omp parallel for default(none) private(k,cell) shared(sim,mesh)
    for(i=0;i<mesh->ncells;i++){
        cell=&(mesh->cell[i]);
        for(k=0;k<sim->nvar;k++){
            cell->U_aux[k]=cell->U[k];
            cell->U[k]-=sim->dt*((cell->w2->fL_star[k]-cell->w4->fR_star[k])/cell->dx + (cell->w3->fL_star[k]-cell->w1->fR_star[k])/cell->dy + (cell->w6->fL_star[k]-cell->w5->fR_star[k])/cell->dz);
        }            
    }
}

with cell a pointer to an array of cells in the estructure mesh. and I would like to parallelize it with OpenACC.

The structure sim only includes scalars, but the structures mesh includes 1D arrays of cell and wall

struct t_mesh_{
    int ncells;
    t_cell *cell;
    t_wall *wall;
    // and other variables not used in this function...
};

where the structure cell is defined as:

struct t_cell_{
    double dx,dy,dz;
    double *U,*U_aux;
    t_wall *w1, *w2, *w3, *w4, *w5, *w6;
    // and other variables not used in this function...
};

Note that *U,*U_aux are small 1D arrays but *w1, *w2, *w3, *w4, *w5, *w6 are pointers to the array wall:

struct t_wall_{
    double *fR_star,*fL_star;
    // and other variables not used in this function...
};

where fR_star and fL_star are small 1D arrays.

¿How should I define the OpenACC pragma and manage the memory?

Thanks in advance.

[Edited]

In the script attached below, the creation and allocation of the structures, as well as the pointers used in the loop of interest is shown. Most parts of the code have been omitted, only retaining those mentioned in the question:

//macros omitted

////////////////////////////////////////////////////
//////////////  S T R U C T U R E S  ///////////////
////////////////////////////////////////////////////

typedef struct t_node_ t_node;
typedef struct t_cell_ t_cell; 
typedef struct t_wall_ t_wall;
typedef struct t_mesh_ t_mesh;
typedef struct t_sim_ t_sim;

struct t_node_{
    int id;
    double x,y; 

};

struct t_cell_{
    int id;
    int l,m;
    double *U;  //array of cell-valued variables
    double dx,dy;
    double xc,yc;
    int n1,n2,n3,n4; 
    int w1_id,w2_id,w3_id,w4_id;
    t_wall *w1, *w2, *w3, *w4; //pointers to mesh->wall 
};

struct t_wall_{
    int id;
    double *fR_star,*fL_star; //array of left and right fluxes at each wall (same dimension than U)
    int cellR_id, cellL_id; 
    t_cell *cellR, *cellL; //pointers to the right and left cells of the wall (mesh->cell)
    double nx, ny;

};

struct t_mesh_{
    int xcells, ycells; 
      double dx, dy;
    int ncells; //number of cells
    int nwalls; //number of walls
    int nnodes;
    t_cell *cell; //array of cell structures
    t_wall *wall; //array of wall structures
    t_node *node; //array of node structures
    t_sim *sim;
};

struct t_sim_{
    double dt,t,CFL; 
    double tf, tVolc; 
    int rk_steps; 
    int order; 
    int nvar; //number of variables (dimension of U, fR_star, fL_star...)

};


////////////////////////////////////////////////////
//////  F U N C T I O N   D E F I N I T I O N //////
////////////////////////////////////////////////////


int create_mesh(t_mesh *mesh,t_sim *sim);
void update_cellK1(t_mesh *mesh, t_sim *sim);


////////////////////////////////////////////////////
//////  P R E - P R O C.   F U N C T I O N S ///////
////////////////////////////////////////////////////

int create_mesh(t_mesh *mesh, t_sim *sim){
    int i,l,m,k,aux,p;
    int xcells,ycells; 
    t_cell *cell;
    t_wall *wall;
    t_node *node;
    int semiSt;
    

    mesh->sim=sim;

    //Cells
    xcells=mesh->xcells;
    ycells=mesh->ycells;
    mesh->ncells=xcells*ycells;
    mesh->cell=(t_cell*)malloc(mesh->ncells*sizeof(t_cell));
    cell=mesh->cell;    
      
    //Walls
    mesh->nwalls=2*mesh->ncells+xcells+mesh->ycells;
    mesh->wall=(t_wall*)malloc(mesh->nwalls*sizeof(t_wall));

    wall=mesh->wall;
    for(k=0;k<mesh->nwalls;k++){
        wall[k].id=k;
        wall[k].fR_star=(double*)malloc(sim->nvar*sizeof(double));
        wall[k].fL_star=(double*)malloc(sim->nvar*sizeof(double));
    }

    //Walls and nodes of the cells
    for(m=0;m<ycells-1;m++){
        for(l=0;l<xcells-1;l++){
            
            k=xcells*m+l;
            cell[k].id=k;
            cell[k].l=l;
            cell[k].m=m;
            
            cell[k].w1_id=2*(k)+m;
            // ...

            cell[k].w1=&(mesh->wall[cell[k].w1_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall 
            cell[k].w2=&(mesh->wall[cell[k].w2_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall 
            cell[k].w3=&(mesh->wall[cell[k].w3_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall 
            cell[k].w4=&(mesh->wall[cell[k].w4_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall 
                  // ...
                    // ...

        }
    }

    //Assigment of wall's neighbour cells 
    for(m=0;m<ycells;m++){
        for(l=0;l<xcells;l++){
            
            k=xcells*m+l;
            cell[k].w1->cellR_id=cell[k].id;
                  // ...

            cell[k].w1->cellR=&(cell[k]);
            cell[k].w4->cellR=&(cell[k]);
            cell[k].w2->cellL=&(cell[k]);
            cell[k].w3->cellL=&(cell[k]);

            //...
                  //other special cases omitted
                  //...

        }
    }

    //Allocation of arrays of variables "U" in cells and walls
    for(k=0;k<mesh->ncells;k++){        
        mesh->cell[k].U    =(double*)malloc(sim->nvar*sizeof(double));
    }
    
    return 1;
}


void update_cellK1(t_mesh *mesh, t_sim *sim){   
    int i,k;
    t_cell *cell;  
#pragma omp parallel for default(none) private(k,cell) shared(sim,mesh)
    for(i=0;i<mesh->ncells;i++){
        cell=&(mesh->cell[i]);
        for(k=0;k<sim->nvar;k++){
            cell->U[k]-=sim->dt*((cell->w2->fL_star[k]-cell->w4->fR_star[k])/cell->dx + (cell->w3->fL_star[k]-cell->w1->fR_star[k])/cell->dy);
        }
    }   
}


////////////////////////////////////////////////////
//////////////////// M A I N ///////////////////////
////////////////////////////////////////////////////

int main(int argc, char * argv[]){
    
    int i, j, k, p;
    t_mesh *mesh;
    t_sim *sim;
    double tf,t;
    int nIt; 
    double timeac; 
      
    omp_set_num_threads(NTHREADS);     

    //Mesh and sim allocation
    mesh=(t_mesh*)malloc(sizeof(t_mesh));
    sim =(t_sim*)malloc(sizeof(t_sim));
    
    ////////////////////////////////////////////////////
    ////////////// P R E - P R O C E S S ///////////////
    ////////////////////////////////////////////////////

    //...
    //variable initialization and file reading omitted
    //cell->dx= ...
    //cell->dy= ...
    //...
      
    create_mesh(mesh,sim); 
    update_initial(mesh); //this function (omitted) assings the initial values of cell[k].U[0] ... cell[k].U[4]

    ////////////////////////////////////////////////////
    ////////////// C A L C U L A T I O N ///////////////
    ////////////////////////////////////////////////////
    tf=sim->tf;
    sim->t=0.0;
    t=0.0;
    while(t<tf){        
        compute_fluxes(mesh,sim); //this function (omitted) computes *fR_star,*fL_star of walls (line 32), which are then used in "update_cellK1()"
        update_dt(mesh,sim);      //this function (omitted) computes sim->dt
        update_cellK1(mesh,sim);                                
        t+=sim->dt; //Time updated
        sim->t=t;          
    }
    
    return 1;

}

Upvotes: 0

Views: 213

Answers (1)

Mat Colgrove
Mat Colgrove

Reputation: 5646

The easiest thing to do is to compile with CUDA Unified Memory assuming you're using either the PGI Compiler (-ta=tesla:managed) or NVIDIA HPC Compiler (-acc -gpu=managed). Provided that the memory is allocated, the CUDA runtime will handle all the data management for you.

Though if you do need to manually manage the data, while tricky for this case, it's not too difficult. If you can provide an example code which show how you create the full structures, I can help show you how to manage the data. Most likely we'll want to use the OpenACC API calls so we can directly manage the device pointers and use either "acc_attach" or "acc_map_data" calls to have the "w" variables point to the correct "wall".

[EDIT]

Below is my first attempt at adding manual data movement. I opted for the easier route of using directives and then attaching the already allocated device data. The problem being that since the code is incomplete (would have been good to have dummy values for initialization), I can't actually test if the code works as expected. Sometimes in these types of codes, we need to opt to use only the API calls so we can have direct control over the pointers.

//macros omitted
#include <stdlib.h>
#include <stdio.h>

#ifdef _OPENACC
#include <openacc.h>
#endif

////////////////////////////////////////////////////
//////////////  S T R U C T U R E S  ///////////////
////////////////////////////////////////////////////

typedef struct t_node_ t_node;
typedef struct t_cell_ t_cell;
typedef struct t_wall_ t_wall;
typedef struct t_mesh_ t_mesh;
typedef struct t_sim_ t_sim;

struct t_node_{
    int id;
    double x,y;

};

struct t_cell_{
    int id;
    int l,m;
    double *U;  //array of cell-valued variables
    double dx,dy;
    double xc,yc;
    int n1,n2,n3,n4;
    int w1_id,w2_id,w3_id,w4_id;
    t_wall *w1, *w2, *w3, *w4; //pointers to mesh->wall
};

struct t_wall_{
    int id;
    double *fR_star,*fL_star; //array of left and right fluxes at each wall (same dimension than U)
    int cellR_id, cellL_id;
    t_cell *cellR, *cellL; //pointers to the right and left cells of the wall (mesh->cell)
    double nx, ny;

};

struct t_mesh_{
    int xcells, ycells;
      double dx, dy;
    int ncells; //number of cells
    int nwalls; //number of walls
    int nnodes;
    t_cell *cell; //array of cell structures
    t_wall *wall; //array of wall structures
    t_node *node; //array of node structures
    t_sim *sim;
};

struct t_sim_{
    double dt,t,CFL;
    double tf, tVolc;
    int rk_steps;
    int order;
    int nvar; //number of variables (dimension of U, fR_star, fL_star...)

};


////////////////////////////////////////////////////
//////  F U N C T I O N   D E F I N I T I O N //////
////////////////////////////////////////////////////


int create_mesh(t_mesh *mesh,t_sim *sim);
void update_cellK1(t_mesh *mesh, t_sim *sim);


////////////////////////////////////////////////////
//////  P R E - P R O C.   F U N C T I O N S ///////
////////////////////////////////////////////////////

int create_mesh(t_mesh *mesh, t_sim *sim){
    int i,l,m,k,aux,p;
    int xcells,ycells;
    t_cell *cell;
    t_wall *wall;
    t_node *node;
    int semiSt;

    mesh->sim=sim;

    //Cells
    xcells=mesh->xcells;
    ycells=mesh->ycells;
    mesh->ncells=xcells*ycells;
    mesh->cell=(t_cell*)malloc(mesh->ncells*sizeof(t_cell));
    cell=mesh->cell;

    //Walls
    mesh->nwalls=2*mesh->ncells+xcells+mesh->ycells;
    mesh->wall=(t_wall*)malloc(mesh->nwalls*sizeof(t_wall));

#ifdef _OPENACC
    acc_attach((void**)&mesh->sim);
#pragma acc update device(mesh->ncells,mesh->nwalls)
#pragma acc enter data create(mesh->cell[:mesh->ncells],mesh->wall[:mesh->nwalls])
#endif

    wall=mesh->wall;
    for(k=0;k<mesh->nwalls;k++){
        wall[k].id=k;
        wall[k].fR_star=(double*)malloc(sim->nvar*sizeof(double));
        wall[k].fL_star=(double*)malloc(sim->nvar*sizeof(double));
#pragma acc enter data create(wall[k].fR_star[:sim->nvar], wall[k].fL_star[:sim->nvar])
    }

    //Walls and nodes of the cells
    for(m=0;m<ycells-1;m++){
        for(l=0;l<xcells-1;l++){

            k=xcells*m+l;
            cell[k].id=k;
            cell[k].l=l;
            cell[k].m=m;

            cell[k].w1_id=2*(k)+m;
            // ...

            cell[k].w1=&(mesh->wall[cell[k].w1_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall
            cell[k].w2=&(mesh->wall[cell[k].w2_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall
            cell[k].w3=&(mesh->wall[cell[k].w3_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall
            cell[k].w4=&(mesh->wall[cell[k].w4_id]);  // <------- cells' walls used in function "update_cellK1()" are pointers to mesh->wall
                  // ...
                    // ...
#ifdef _OPENACC
#pragma acc update device(cell[k:1])
            acc_attach((void**)&cell[k].w1);
            acc_attach((void**)&cell[k].w2);
            acc_attach((void**)&cell[k].w3);
            acc_attach((void**)&cell[k].w4);
#endif
        }
    }

    //Assigment of wall's neighbour cells
    for(m=0;m<ycells;m++){
        for(l=0;l<xcells;l++){

            k=xcells*m+l;
            cell[k].w1->cellR_id=cell[k].id;
                  // ...

            cell[k].w1->cellR=&(cell[k]);
            cell[k].w4->cellR=&(cell[k]);
            cell[k].w2->cellL=&(cell[k]);
            cell[k].w3->cellL=&(cell[k]);

            //...
                  //other special cases omitted
                  //...
#ifdef _OPENACC
#pragma acc update device(cell[k].w1->cellR_id)
            acc_attach((void**)&cell[k].w1->cellR);
            acc_attach((void**)&cell[k].w4->cellR);
            acc_attach((void**)&cell[k].w2->cellL);
            acc_attach((void**)&cell[k].w3->cellL);
#endif

        }
    }

    //Allocation of arrays of variables "U" in cells and walls
    for(k=0;k<mesh->ncells;k++){
        mesh->cell[k].U    =(double*)malloc(sim->nvar*sizeof(double));
#pragma acc enter data create(mesh->cell[k].U[:sim->nvar])
    }

    return 1;
}


void update_cellK1(t_mesh *mesh, t_sim *sim){
    int i,k;
    t_cell *cell;
#pragma acc parallel loop present(mesh,sim) private(cell)
    for(i=0;i<mesh->ncells;i++){
        cell=&(mesh->cell[i]);
        for(k=0;k<sim->nvar;k++){
            cell->U[k]-=sim->dt*((cell->w2->fL_star[k]-cell->w4->fR_star[k])/cell->dx + (cell->w3->fL_star[k]-cell->w1->fR_star[k])/cell->dy);
        }
    }
}


////////////////////////////////////////////////////
//////////////////// M A I N ///////////////////////
////////////////////////////////////////////////////

int main(int argc, char * argv[]){

    int i, j, k, p;
    t_mesh *mesh;
    t_sim *sim;
    double tf,t;
    int nIt;
    double timeac;

    //Mesh and sim allocation
    mesh=(t_mesh*)malloc(sizeof(t_mesh));
    sim =(t_sim*)malloc(sizeof(t_sim));


    ////////////////////////////////////////////////////
    ////////////// P R E - P R O C E S S ///////////////
    ////////////////////////////////////////////////////

    //...
    //variable initialization and file reading omitted
    //cell->dx= ...
    //cell->dy= ...
    //...
#pragma acc enter data copyin(mesh[:1]) create(sim[:1])

    create_mesh(mesh,sim);
//    update_initial(mesh); //this function (omitted) assings the initial values of cell[k].U[0] ... cell[k].U[4]

    ////////////////////////////////////////////////////
    ////////////// C A L C U L A T I O N ///////////////
    ////////////////////////////////////////////////////
    tf=sim->tf;
    sim->t=0.0;
    t=0.0;
//    while(t<tf){
//        compute_fluxes(mesh,sim); //this function (omitted) computes *fR_star,*fL_star of walls (line 32), which are then used in "update_cellK1()"
//        update_dt(mesh,sim);      //this function (omitted) computes sim->dt
        update_cellK1(mesh,sim);
//        t+=sim->dt; //Time updated
//        sim->t=t;
//    }

    return 1;

}

Upvotes: 1

Related Questions