Qova90

Reputation: 1

OpenACC/OpenMP :: FATAL ERROR: variable in data clause is partially present on the device: name=Tnew

I am very new to parallel programming. I have been working on a class project and have to implement a hybrid model using OpenMP and OpenACC to solve the discretized 2D Laplace equation, computing a portion of the rows on the CPU and the rest on the GPU.

Compilation is successful, however I get "FATAL ERROR: variable in data clause is partially present on the device: name=Tnew" errors when I run.

#include <omp.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <chrono>
#include <iomanip>


using namespace std;


int main(int argc, char *argv[])
{
    //Total size of the grid
    int grid_size = atoi(argv[1]);
    // a variable to determine the row to split the entire grid between CPU and GPU
    int split = atoi(argv[2]);

    double * T = new double[(grid_size+2)*(grid_size+2)];
    double * Tnew = new double[(grid_size+2)*(grid_size+2)];
    double tol = 1e-5;

    int nthreads = atoi(argv[3]);
    omp_set_num_threads(nthreads);

    cout << "Grid size is " << grid_size << "number of threads " << nthreads << endl;
    //Initialize arrays
    for (int i=0; i<grid_size+2; ++i) {
        for (int j=0; j<grid_size+2; ++j) {
            T[i*(grid_size+2) + j] = 0;
            if (0 == i && 0 != j && grid_size+1 != j) { T[i*(grid_size+2) + j] = 100; }
            else if (grid_size+1 == i) T[i*(grid_size+2) + j] = 0;
            else if (0 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 75; }
            else if (grid_size+1 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 50; }
        }
    }

    //Print out array 
    if (grid_size <= 20) {
        for (int i=0; i<grid_size+2; ++i) {
            for (int j=0; j<grid_size+2; ++j) {
                cout << T[i*(grid_size+2) + j] << '\t';
            }
            cout << endl;
        }
    }


    double calc_time = omp_get_wtime();

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();

        /* Select the last thread to interact with the GPU. Push the contents of array T, beginning
        from the split location till the end, to the GPU.
        */

        if(tid==nthreads-1){

            int iteration = 0;
            double error = 1.0;

            // Copy rows of T beginning from a row before the split location till the end, and copy out from the split location till the end.
            #pragma acc enter data copyin(T[split*(grid_size+2):(grid_size+2)*(grid_size+1)]) create(Tnew[split*(grid_size+2):(grid_size+2)*(grid_size+1)])
            while (error > tol && iteration < 3000)
            {

                error = 0.0;
                iteration++;


                #pragma acc loop independent reduction(+:error)

                for(int a = split+1; a < grid_size+1; a++){
                    for(int b = 1; b < grid_size+1; b++){
                        Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
                                        +T[(a+1)*(grid_size+2)+b]
                                        +T[a*(grid_size+2)+(b-1)]
                                        +T[a*(grid_size+2)+(b+1)]);
                        //error = fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]);
                        error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
                    }
                }

               #pragma acc loop independent 

                for(int ai = split+1; ai < grid_size+1; ai++){
                    for(int bi = 1; bi < grid_size + 1; bi++){
                         T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
                    }
                }

                // Update the GPU's boundary row in main memory
                #pragma acc update self(T[(split+1)*(grid_size+2):((split+1)*(grid_size+2)+ grid_size)])
                // Update the CPU thread's boundary row on the GPU
                #pragma acc update device(T[(split)*(grid_size+2):(split*(grid_size+2)+ grid_size)])
            }
                #pragma acc exit data copyout(T[(split+1)*(grid_size+2):(grid_size+2)*(grid_size+1)])

            cout << "GPU Portion Completed" <<  iteration << " Iterations" << endl;
        }

        // The first rows, up to the split location, get computed by the rest of the OpenMP threads
        else 
        {
            double error = 1.0;
            int  iteration = 0;

            while (error > tol && iteration < 3000) {
                error = 0;
                #pragma omp for collapse(2) nowait
                //#pragma acc kernels         
                for(int a = 1; a < split+1; a++){
                    for(int b = 1; b < grid_size+1; b++){
                        Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
                                        +T[(a+1)*(grid_size+2)+b]
                                        +T[a*(grid_size+2)+(b-1)]
                                        +T[a*(grid_size+2)+(b+1)]);

                        error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
                    }
                }

                #pragma omp for collapse(2) nowait
                for(int ai = 1; ai < split+1; ai++){
                    for(int bi = 1; bi < grid_size + 1; bi++){
                        T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
                    }
                }               
            }
        }       
    }

    calc_time = omp_get_wtime() - calc_time;
    cout << "calc time " << calc_time << endl;


    if (grid_size <= 20) {
        for (int i=0; i<grid_size+2; ++i) {
            for (int j=0; j<grid_size+2; ++j) {
                cout << setprecision(5) << T[i*(grid_size+2) + j] << '\t';
            }
            cout << endl;
        }
    }

    delete [] T;
    delete [] Tnew;
}

Below is the output I get when I compile:


pgc++ -mp -acc -Minfo mixed_omp_acc.cpp -o omp_acc

main:
      7, include "iostream"
          35, include "iostream"
                4, include "ostream"
                    38, include "ios"
                         44, include "basic_ios.h"
                              53, Parallel region activated
                             128, Parallel region terminated
     64, Generating copyout(T[(grid_size+1)*(split+1):(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating create(iteration) [if not already present]
         Generating copyin(tol) [if not already present]
         Generating create(Tnew[split:(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating copyout(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)]) [if not already present]
         Generating copyin(error) [if not already present]
     94, Generating update self(T[(grid_size+2)*(split+1):(grid_size+2)*(grid_size+1)])
         Generating update device(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)])
    106, Parallel loop activated with static block schedule
    114, Barrier
    118, Parallel loop activated with static block schedule
    122, Barrier

Below are the errors I get when I run.

The first argument is the grid size, the second is the row index where the grid is divided between OpenMP and OpenACC, and the third is the number of CPU threads. I tried to assign the last CPU thread to interact with the GPU.
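An example invocation (the grid size, split row, and thread count shown here are just placeholder values) would be:

./omp_acc 20 10 4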

T lives at 0x8cc130 size 3696 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 7.0, threadid=1
host:0x8cc080 device:0x7f09b3afa000 size:3696 presentcount:0+1 line:69 name:T
host:0x8ccfb0 device:0x7f09b3afb000 size:3696 presentcount:0+1 line:69 name:Tnew
allocated block device:0x7f09b3afa000 size:4096 thread:1
allocated block device:0x7f09b3afb000 size:4096 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=T
******* mixed_omp_acc.cpp main_1F252L55 line:106

Upvotes: 0

Views: 720

Answers (1)

Mat Colgrove

Reputation: 5646

In OpenACC, the array shaping syntax in C/C++ is the starting element followed by the number of elements to copy, i.e. "arr[start:length]". It appears you're using it as "arr[start:end]", so when it gets to the update clause, the host T array is too small to hold the results. To fix, update your array shapes to use the starting element followed by the number of elements to copy, not the range.
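For illustration, here is a minimal sketch of what the data and update clauses could look like with start:length shaping, assuming the GPU computes rows split+1 through grid_size and keeps rows split and grid_size+1 as halo rows. The helper variables "row" and "gpu_rows" are introduced here only for readability, and the exact spans should be adapted to your decomposition:

int row = grid_size + 2;                // elements per grid row, including the halo columns
int gpu_rows = (grid_size + 2) - split; // rows split .. grid_size+1 kept on the device

// Copy rows split .. grid_size+1 of T to the device and allocate the same span of Tnew
#pragma acc enter data copyin(T[split*row : gpu_rows*row]) create(Tnew[split*row : gpu_rows*row])

// Bring the GPU's first computed row (split+1) back to the host: start element, then a length of one row
#pragma acc update self(T[(split+1)*row : row])

// Push the CPU-owned boundary row (split) to the device
#pragma acc update device(T[split*row : row])

// Copy the GPU-computed rows (split+1 .. grid_size) back to the host at the end
#pragma acc exit data copyout(T[(split+1)*row : (grid_size - split)*row])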

Upvotes: 1
