Reputation: 1
I am very new to parallel programming. I have been working on a class project and have to implement a hybrid model using openmp and openacc to compute the discretized 2D laplace equation by computing a portion of the rows on cpu and the rest on GPU.
Compilation is successful however I get "FATAL ERROR: variable in data clause is partially present on the device: name=Tnew" errors when i run.
#include <omp.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <chrono>
#include <iomanip>
using namespace std;
int main(int argc, char *argv[])
{
//Total size of the grid
int grid_size = atoi(argv[1]);
// a variable to determine the row to split the entire grid between CPU and GPU
int split = atoi(argv[2]);
double * T = new double[(grid_size+2)*(grid_size+2)];
double * Tnew = new double[(grid_size+2)*(grid_size+2)];
double tol = 1e-5;
int nthreads = atoi(argv[3]);
omp_set_num_threads(nthreads);
cout << "Grid size is " << grid_size << "number of threads " << nthreads << endl;
//Initialize arrays
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
T[i*(grid_size+2) + j] = 0;
if (0 == i && 0 != j && grid_size+1 != j) { T[i*(grid_size+2) + j] = 100; }
else if (grid_size+1 == i) T[i*(grid_size+2) + j] = 0;
else if (0 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 75; }
else if (grid_size+1 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 50; }
}
}
//Print out array
if (grid_size <= 20) {
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
cout << T[i*(grid_size+2) + j] << '\t';
}
cout << endl;
}
}
double calc_time = omp_get_wtime();
#pragma omp parallel
{
int tid = omp_get_thread_num();
/* Select the last thread to interact with gpu. Push the contents of array T beggining
from the split location till the end to the gpu
*/
if(tid==nthreads-1){
int iteration = 0;
double error = 1.0;
// Copy Rows of T begining from a row before split location till end and copyout split location till the end.
#pragma acc enter data copyin(T[split*(grid_size+2):(grid_size+2)*(grid_size+1)]) create(Tnew[split*(grid_size+2):(grid_size+2)*(grid_size+1)])
while (error > tol && iteration <3000)
{
error = 0.0;
iteration++;
#pragma acc loop independent reduction(+:error)
for(int a = split+1; a < grid_size+1; a++){
for(int b = 1; b < grid_size+1; b++){
Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
+T[(a+1)*(grid_size+2)+b]
+T[a*(grid_size+2)+(b-1)]
+T[a*(grid_size+2)+(b+1)]);
//error = fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]);
error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
}
}
#pragma acc loop independent
for(int ai = split+1; ai < grid_size+1; ai++){
for(int bi = 1; bi < grid_size + 1; bi++){
T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
}
}
// Update the gpu's boundary row in main memory
#pragma acc update self(T[(split+1)*(grid_size+2):((split+1)*(grid_size+2)+ grid_size)])
// Update the threads boundary row in GPU
#pragma acc update device(T[(split)*(grid_size+2):(split*(grid_size+2)+ grid_size)])
}
#pragma acc exit data copyout(T[(split+1)*(grid_size+2):(grid_size+2)*(grid_size+1)])
cout << "GPU Portion Completed" << iteration << " Iterations" << endl;
}
// The first N rows until the split location gets computed by the rest of omp threads
else
{
double error = 1.0;
int iteration = 0;
while (error > tol && iteration <3000) {
error=0;
#pragma omp for collapse(2) nowait
//#pragma acc kernels
for(int a = 1; a < split+1; a++){
for(int b = 1; b < grid_size+1; b++){
Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
+T[(a+1)*(grid_size+2)+b]
+T[a*(grid_size+2)+(b-1)]
+T[a*(grid_size+2)+(b+1)]);
error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
}
}
#pragma omp for collapse(2) nowait
for(int ai = 1; ai < split+1; ai++){
for(int bi = 1; bi < grid_size + 1; bi++){
T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
}
}
}
}
}
calc_time = omp_get_wtime() - calc_time;
cout << "calc time " << calc_time << endl;
if (grid_size <= 20) {
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
cout << setprecision(5) << T[i*(grid_size+2) + j] << '\t';
}
cout << endl;
}
}
delete [] T;
delete [] Tnew;
}
Below is the message i get when i compile
pgc++ -mp -acc -Minfo mixed_omp_acc.cpp -o omp_acc
main:
7, include "iostream"
35, include "iostream"
4, include "ostream"
38, include "ios"
44, include "basic_ios.h"
53, Parallel region activated
128, Parallel region terminated
64, Generating copyout(T[(grid_size+1)*(split+1):(grid_size+2)*(grid_size+1)]) [if not already present]
Generating create(iteration) [if not already present]
Generating copyin(tol) [if not already present]
Generating create(Tnew[split:(grid_size+2)*(grid_size+1)]) [if not already present]
Generating copyout(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)]) [if not already present]
Generating copyin(error) [if not already present]
94, Generating update self(T[(grid_size+2)*(split+1):(grid_size+2)*(grid_size+1)])
Generating update device(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)])
106, Parallel loop activated with static block schedule
114, Barrier
118, Parallel loop activated with static block schedule
122, Barrier
Below is the errors i get when i run.
The first argument is the grid size, second is the index of division between openmp and openacc and the third is the number of cpu threads. I tried to assign the last cpu thread to interact with the gpu.
T lives at 0x8cc130 size 3696 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 7.0, threadid=1
host:0x8cc080 device:0x7f09b3afa000 size:3696 presentcount:0+1 line:69 name:T
host:0x8ccfb0 device:0x7f09b3afb000 size:3696 presentcount:0+1 line:69 name:Tnew
allocated block device:0x7f09b3afa000 size:4096 thread:1
allocated block device:0x7f09b3afb000 size:4096 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=T
******* mixed_omp_acc.cpp main_1F252L55 line:106
Upvotes: 0
Views: 720
Reputation: 5646
For OpenACC array shaping syntax in C/C++ is the starting element followed by the number of elements to copy, i.e. "arr[start:length]". Though it appears you're using it as "arr[start:end]" so that when it gets to the update clause, the host T array is too small to hold the results. To fix, update your array shape to use the starting element followed by the number of elements to copy, not the range.
Upvotes: 1