Changseok Ma
Changseok Ma

Reputation: 89

how should I set copy() when using Openacc(parallelization) in C++?

I am using gcc compiler. (g++ -o test testfile.cpp)

I want to use Openacc to parallelize my code but I am a bit confused about using #pragma correctly.

Below is the part where I used parallelization.

Even after using Openacc the code is not faster than before.

I guess this is related to 'data-moving' thing.

So I think I need to use #pragma acc data copy here. But I am not sure how to use this properly.

Any help? Thanks in advance.

#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h>       // power
#include <cmath>        // abs
#include <fstream>

using namespace std;
using namespace chrono;

// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
            fill_n(Mat[i][j], Ny, 1);
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** Mat = new int** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new int* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new int[Ny];
        }
    }
    *pMat = Mat;
}

// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
    float C;
    C = y + a - a_f / R - (d_f - (1 - delta) * d);
    float result;
    if (C > 0) {
        result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    }
    else {
        result = -999999;
    }
    return result;
}


int main()
{
    
    float duration;

    // Iteration Parameters
    double tol = 0.000001;
    int itmax = 200;
    int H = 15;

    // Model Parameters and utility function
    double sig = 0.75;
    double beta = 0.95;
    double psi = 0.5;
    double delta = 0.1;
    double R = 1 / beta - 0.00215;

    // =============== 2. Discretizing the state space =========================

    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;

    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2, 0.6, 0.2 };

    // Discretization of state space
    float ca = (amax - amin) / (Na - 1.0);
    float cd = (dmax - dmin) / (Nd - 1.0);
    float cy = (ymax - ymin) / (Ny - 1.0);

    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];

    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }

    // === 3. Initial guesses, Variable initialization and Transition matrix ===

    // Initial guess for value function
    float*** V;
    dallo_fn(&V, Na, Nd, Ny);
    float*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);

    // Initialization of other variables
    float Val[Na][Nd];
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        Vfuture[i] = new float[Nd];
    }
    float** temphoward = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        temphoward[i] = new float[Nd];
    }

    float*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    float*** tempdiff;
    dallo_fn0(&tempdiff, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);

    float** mg_A_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_A_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_A_v[i][j] = A[i];
        }
    }

    float** mg_D_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_D_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_D_v[i][j] = D[j];
        }
    }

    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                }
            }
        }
    }

    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                for (int l = 0; l < Na; l++) {
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }

    // Value function iteration
    int it;
    float dif;
    float max;
    it = 0;
    dif = 1;

    // ================ 4. Value function iteration ============================
    
    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
                }
            }
        }
        
        #pragma acc kernels
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

}

Upvotes: 0

Views: 244

Answers (1)

Mat Colgrove
Mat Colgrove

Reputation: 5646

Are you using the flag to enable OpenACC, i.e. "-fopenacc"? If not the OpenACC directives will be ignored.

Note that you'll want to use a newer GNU version, 10.2 preferable, as GNU support for OpenACC has gotten better over the years. I believe their compiler loop dependency analysis is still lacking so will run "kernels" compute regions sequentially on the device. Hence, for now, you'll want to stick to using "parallel" regions. If you do really want to use "kernels", I'd suggest switching to the NVIDIA HPC compilers (full disclosure, I work for NVIDIA)

Now I think the initial problem is just that you're not enabling OpenACC and why it's the same speed. Actually here I'd expect this case to be extremely slow if you tried to offload it. Besides running the "kernels" region sequentially on the device, the data would need to be transferred back and forth between the host and device each timestep.

The optimal strategy is to have a data region outside of the while loop, use an "update" directive when needed to synchronize the device and host copies of the arrays, and then ensure all the compute has been offload to the device.

Since you didn't post a complete reproducer, I can't test this code and hence verify that it's correct. But to give you an idea of this strategy, I modified your code below:

#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny], Ptrans[:Ny]) \
        create(Vfuture[:Na][Nd], V[:Na][:Nd][:Ny], maxposition_a[:Na][:Nd][:Ny], maxposition_b[:Na][:Nd][:Ny]) \
        create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed

while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        #pragma acc parallel loop collapse(3) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        #pragma acc parallel loop collapse(2) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
                }
            }
        }

        #pragma acc parallel loop collapse(3) default(present)
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            #pragma acc parallel loop collapse(3) default(present)
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            #pragma acc parallel loop collapse(2) default(present)
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        // I'm unclear why your using a 2D array for temphoward.  It's preventing
                        // parallelzation of the inner loop and could be replaced with a scalar.
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        #pragma acc parallel loop collapse(3) reduction(max:dif)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    // Again, why aren't you using a scalar here for tempdiff?
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

#pragma acc update self(Vnew[:Na][:Nd][:Ny])
    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << '\n';
        }
    }

}

#pragma acc exit data delete(Vnew, Ptrans, Vfuture, V, maxposition_a, maxposition_b, Vhoward)
// add others here as needed

Upvotes: 1

Related Questions