Reputation: 39
I used OpenCV together with OpenACC to transfer image data to the GPU, but I got an error when I compiled and ran the code. Please take a look at the information below:
#include<queue>
#include <vector>
#include<random>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include<openacc.h>
using namespace std;
using namespace cv;
// Reproduction program: loads an image, builds a grayscale version on the host,
// then runs two OpenACC parallel loops - one over a small int array and one
// that writes into a cv::Mat through duplicate.at<uchar>().
int main(){
// Load the source image from a hard-coded path.
cv::Mat srcImg=cv::imread("/home/usera/images/blue-mountains.jpg");
// Two single-channel 8-bit images with the source dimensions,
// filled with 0 and 255 respectively.
Mat grayScale(srcImg.rows, srcImg.cols, CV_8UC1, Scalar::all(0));
Mat duplicate(srcImg.rows,srcImg.cols, CV_8UC1,Scalar::all(255) );
// Bail out when srcImg holds no pixel data.
if(!srcImg.data){
cout<<"The file is not loaded or does not exist"<<endl;
return -1;
}
// Host-side grayscale conversion: weighted sum of the three channel values,
// truncated to uchar.
// NOTE(review): cv::imread documents B,G,R channel order for color images, so
// the weights 0.21/0.72/0.07 look like they are applied in R,G,B order - confirm.
for(int i = 0; i < srcImg.rows; i++) {
for(int j = 0; j < srcImg.cols; j++) {
double gray = 0.21 * srcImg.at<cv::Vec3b>(i,j)[0] +
0.72 * srcImg.at<cv::Vec3b>(i,j)[1] +
0.07 * srcImg.at<cv::Vec3b>(i,j)[2];
grayScale.at<uchar>(i,j) = (uchar) gray;
}
}
cout<<"Matrix grayScale :"<<grayScale.rows<<" "<<grayScale.cols<<endl;
// Loop bounds for the OpenACC region further down.
int vrows=srcImg.rows;
int vcols=srcImg.cols;
cout<<"Step"<<grayScale.step<<endl;
// OpenACC check on a plain 3x5 int array: every element is set to i+j, and the
// whole array is listed in a copy clause.
int b[3][5];
#pragma acc parallel loop copy(b[:3][:5])
for(int i=0;i<3;i++){
#pragma acc loop
for(int j=0;j<5;j++){
b[i][j]=i+j;
}
}
cout<<"b[N-1][M-2] :"<<b[1][1]<<endl;
cout<<"b[N][M] :"<<b[2][4]<<endl;
// Raw pointer to grayScale's pixel buffer; two of its bytes are printed.
auto *startaddress=grayScale.data;
cout<<(int)*(startaddress+1)<<endl;
cout<<(int)*(startaddress+2)<<endl;
// Parallel fill of `duplicate` with the value 10.
// NOTE(review): this region references the cv::Mat object `duplicate` itself and
// has no data clause for its pixel buffer; the run reported below fails with
// "partially present ... name=duplicate", presumably because of this - confirm.
#pragma acc parallel loop collapse(2)
for(int i=0;i<vrows;i++){
//#pragma acc loop
for(int j=0;j<vcols;j++){
duplicate.at<uchar>(i,j)=10;//grayScale.at<uchar>(i,j);
}
}
// Both lines are labelled "duplicate", but the first one prints a grayScale pixel.
cout<<"duplicate"<<": "<<(int)grayScale.at<uchar>(23,45)<<endl;
cout<<"duplicate"<<": "<<(int)duplicate.at<uchar>(23,45)<<endl;
}
The result shows:
Matrix grayScale :810 1440
Step1440
b[N-1][M-2] :2
b[N][M] :6
194
195
duplicate lives at 0x7ffff78db5c0 size 96 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 8.6, threadid=1
host:0x7ffff78db610 device:0x7f1ba56fa000 size:8 presentcount:1+0 line:129 name:(null)
allocated block device:0x7f1ba56fa000 size:512 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=duplicate
I wonder whether there is not enough space for the gangs or vectors requested for the object `duplicate`, or whether `duplicate` should be copied to the device first, e.g. with `#pragma acc parallel loop copy(duplicate.data[:rows*cols])`. However, I am not sure which kind of array I should use in order to copy `duplicate` to the GPU device.
Could anyone provide any hints or suggestions?
Thanks in advance.
Upvotes: 0
Views: 55
Reputation: 39
I used these methods to implement parallelization.
Thanks to Mat for the helpful suggestion.
// Work on the raw pixel buffers instead of the cv::Mat objects: the data
// clauses below name plain pointers with explicit element counts.
// (h/w and vrows/vcols are the image rows/cols, declared in the full listing.)
auto *startaddress=grayScale.data;

// Step 1: copy the grayscale buffer to the device, fill it with 20, copy it back.
#pragma acc enter data copyin(startaddress[0:vrows*vcols])
// #pragma acc enter data copyin(grayScale) attach(grayScale.data)
#pragma acc parallel loop default(present)
for(int i=0;i<vrows*vcols;i++){
    startaddress[i]=20;
}
#pragma acc exit data copyout(startaddress[0:vrows*vcols])

// Step 2: same approach for the 3-channel source image plus the grayscale output.
auto *srcimage=srcImg.data;
#pragma acc enter data copyin(srcimage[:h*w*3],startaddress[0:vrows*vcols])
#pragma acc parallel loop collapse(2) default(present)
for(int i=0;i<h;i++){
    for(int j=0;j<w;j++){
        // Flat index of pixel (i,j): 3 interleaved bytes per pixel, w pixels per row.
        srcimage[i*w*3+j*3+0]=20;
        srcimage[i*w*3+j*3+1]=69;
        srcimage[i*w*3+j*3+2]=120;
        double gray=0.21*srcimage[i*w*3+j*3+0] + 0.72*srcimage[i*w*3+j*3+1]+0.07*srcimage[i*w*3+j*3+2];
        startaddress[i*w+j]=(uchar)gray;
    }
}
// The directive and its copyout clause must be on one line; both loops are
// closed above so the copy-back happens once, after the compute region.
#pragma acc exit data copyout(srcimage[:h*w*3],startaddress[0:vrows*vcols])
Please check the deepcopy topic for your reference. Reference link
code reference
#include <opencv2/opencv.hpp>
#include<queue>
#include <vector>
#include<random>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include<openacc.h>
using namespace std;
using namespace cv;
// Working example: runs OpenACC loops over the raw pixel buffers of two
// cv::Mat images (via their .data pointers) rather than over the cv::Mat
// objects themselves, then prints a few values for inspection.
// Returns -1 when the image has no pixel data.
int main(){
    cv::Mat srcImg=cv::imread("/home/usera/images/blue-mountains.jpg");
    // Single-channel 8-bit images with the source dimensions (filled with 0 / 255).
    Mat grayScale(srcImg.rows, srcImg.cols, CV_8UC1, Scalar::all(0));
    Mat duplicate(srcImg.rows,srcImg.cols, CV_8UC1,Scalar::all(255) );
    if(!srcImg.data){
        cout<<"The file is not loaded or does not exist"<<endl;
        return -1;
    }

    // Host-side grayscale conversion: weighted channel sum truncated to uchar.
    for(int i = 0; i < srcImg.rows; i++) {
        for(int j = 0; j < srcImg.cols; j++) {
            double gray = 0.21 * srcImg.at<cv::Vec3b>(i,j)[0] +
                          0.72 * srcImg.at<cv::Vec3b>(i,j)[1] +
                          0.07 * srcImg.at<cv::Vec3b>(i,j)[2];
            grayScale.at<uchar>(i,j) = (uchar) gray;
        }
    }

    // Image dimensions used as loop bounds and in the data clauses below.
    int h=srcImg.rows;
    int w=srcImg.cols;
    cout<<"Matrix grayScale :"<<grayScale.rows<<" "<<grayScale.cols<<endl;
    int vrows=srcImg.rows;
    int vcols=srcImg.cols;
    cout<<"Step"<<grayScale.step<<endl;

    // OpenACC check on a plain 3x5 int array: b[i][j] = i + j.
    int b[3][5];
    #pragma acc parallel loop copy(b[:3][:5])
    for(int i=0;i<3;i++){
        #pragma acc loop
        for(int j=0;j<5;j++){
            b[i][j]=i+j;
        }
    }
    cout<<"b[N-1][M-2] :"<<b[1][1]<<endl;
    cout<<"b[N][M] :"<<b[2][4]<<endl;

    // Step 1: copy the grayscale buffer to the device through its raw pointer,
    // fill it with 20, and copy it back.
    auto *startaddress=grayScale.data;
    cout<<(int)*(startaddress+1)<<endl;
    cout<<(int)*(startaddress+2)<<endl;
    #pragma acc enter data copyin(startaddress[0:vrows*vcols])
    #pragma acc parallel loop default(present)
    for(int i=0;i<vrows*vcols;i++){
        startaddress[i]=20;
    }
    #pragma acc exit data copyout(startaddress[0:vrows*vcols])

    // Step 2: overwrite every source pixel with (20, 69, 120) on the device and
    // store the weighted sum in the grayscale buffer.
    auto *srcimage=srcImg.data;
    #pragma acc enter data copyin(srcimage[:h*w*3],startaddress[0:vrows*vcols])
    #pragma acc parallel loop collapse(2) default(present)
    for(int i=0;i<h;i++){
        for(int j=0;j<w;j++){
            // Flat index of pixel (i,j): 3 interleaved bytes per pixel, w pixels per row.
            srcimage[i*w*3+j*3+0]=20;
            srcimage[i*w*3+j*3+1]=69;
            srcimage[i*w*3+j*3+2]=120;
            double gray=0.21*srcimage[i*w*3+j*3+0] + 0.72*srcimage[i*w*3+j*3+1]+0.07*srcimage[i*w*3+j*3+2];
            startaddress[i*w+j]=(uchar)gray;
        }
    }
    // Both loops are closed above, so the copy-back and the prints below run
    // once, after the compute region.
    #pragma acc exit data copyout(srcimage[:h*w*3],startaddress[0:vrows*vcols])

    // Spot checks of the buffers after they are copied back to the host.
    cout<<"srcImge"<<(int)srcimage[3*w+3*26+0]<<endl;
    cout<<"srcImge"<<(int)srcimage[2*w+3*12+0]<<endl;
    cout<<"srcImge"<<(int)srcimage[12*w+3*88+0]<<endl;
    cout<<"srcImge"<<(int)srcimage[2*w+3*12+1]<<endl;
    cout<<"startaddress"<<(int)startaddress[12]<<endl;
    cout<<"startaddress"<<(int)startaddress[23]<<endl;
    cout<<"startaddress"<<(int)startaddress[46]<<endl;
    cout<<0.21*srcimage[30*w*3+13*3+0] + 0.72*srcimage[30*w*3+13*3+1]+0.07*srcimage[30*w*3+13*3+2]<<endl;
}
The result shows:
main:
108, Generating copy(b[:][:]) [if not already present]
Generating NVIDIA GPU code
111, #pragma acc loop gang /* blockIdx.x */
113, #pragma acc loop seq
113, Loop is parallelizable
131, Generating enter data copyin(startaddress[:vcols*vrows])
Generating NVIDIA GPU code
153, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
131, Generating default present(startaddress[:vcols*vrows])
161, Generating exit data copyout(startaddress[:vcols*vrows])
219, Generating enter data copyin(startaddress[:vcols*vrows],srcimage[:(w*h)*3])
Generating NVIDIA GPU code
223, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
224, /* blockIdx.x threadIdx.x collapsed */
219, Generating default present(startaddress[:h],srcimage[:])
250, Generating exit data copyout(startaddress[:vcols*vrows],srcimage[:(w*h)*3])
$ ./test4a
Matrix grayScale :810 1440
Step1440
b[N-1][M-2] :2
b[N][M] :6
194
195
-------------------------
20
20
-------------------------
duplicate: 20
duplicate: 255
srcImge20
srcImge20
srcImge20
srcImge69
startaddress62
startaddress62
startaddress62
62.28
Upvotes: 0