Reputation: 51
The gfortran
compiler gives wrong answer, when I run a parallel program using OpenMP. In the same time, ifort
gives exact result.
This is the whole compilable code.
!_______________________________________________________________ !
!____________MODULE SECTION_____________________________________ !
MODULE MATRIC
IMPLICIT NONE
INTEGER , PARAMETER :: NG = 40
DOUBLE PRECISION,SAVE :: Z , PA , PB ,CMU
DOUBLE PRECISION , PARAMETER :: PI=2.0D0*ACOS(0.0D0) , &
FPI=4.0D0*PI , SQFPI = SQRT(FPI), DLAM=1.0D0
DOUBLE PRECISION , DIMENSION(450) :: DEL1, DEL2, X, R , SNLO
DOUBLE PRECISION :: XG(60) , WG(60)
END MODULE MATRIC
!_________________________________________________________________________!
! MODULE SECTION
!__________________________________________________________________________!
MODULE POTDATA
IMPLICIT NONE
INTEGER :: IA , IB , ID
DOUBLE PRECISION :: RA , RB , R1s(450)
END MODULE POTDATA
!__________________________________________________________________________!
!______________________________________________________________________!
program check
use matric
use potdata
implicit double precision(a-h,o-z)
pa = 0.72D0 ; pb = 0.19D0
mesh = 441 ; noint= 40 ; z = 2.0d0
CALL GAULEG(-1.d0,1.d0)
NB = MESH/NOINT
I = 1
X(I) = 0.0D+00
DELTAX = 0.0025D+00*40.0D+00/DBLE(NOINT)
DO J=1,NB
IMK = (J-1)*NOINT + 1
DO K=1,NOINT
AK=K
I=I+1
X(I)=X(IMK)+AK*DELTAX
END DO
DELTAX=2.0D+00*DELTAX
END DO
CMU=(9.0D00*PI*PI/(128.0D00*Z))**THIRD
R(1)=0.0D+00 ; SNLO(1) = 0.D00
DO I=2,MESH
R(I)=CMU*X(I)
SNLO(I) = R(I)*dexp(-Z*R(I))
R1S(I) = SNLO(I)/(SQFPI*R(I))
END DO
call EFFPOT(MESH,NOINT)
end program check
subroutine EFFPOT(MESH,NOINT)
USE OMP_LIB
USE MATRIC
USE POTDATA
implicit none
integer, intent(in) :: MESH, NOINT
double precision::anorm(450)
double precision, external :: funct
double precision :: asum, fac, cnorm
!$omp parallel do default(none) private(del1,ia,asum,ib,ra,rb,fac) &
!$omp shared(id,mesh,r,anorm,NOINT,del2,R1s)
do ia = 2,mesh
ra = r(ia)
if(R1s(ia).lt.1.D-7.and.R1s(ia).ge.1.D-8)id = ia
do ib = 2,mesh
rb = r(ib)
call QGAUSS(funct,-1.d0,1.d0,fac)
del1(ib) = rb**2*fac*R1s(ib)**2
end do
CALL NCDF(del1,ASUM,r(2),mesh,NOINT)
anorm(ia) = 2.0d0*pi*asum
del2(ia) = 2.0d0*pi*asum*(ra*R1s(ia))**2
end do
!$omp end parallel do
CALL NCDF(del2,ASUM,r(2),mesh,NOINT)
cnorm = 1.0/dsqrt(4.*pi*ASUM)
write(6,*)'cnorm =',cnorm
return
end
double precision function funct(x)
USE POTDATA , ONLY : RA , RB
USE MATRIC , ONLY : PA , PB , DLAM
implicit none
double precision, intent(in) :: x
double precision :: f1, f2, ramrb
ramrb = dsqrt(ra**2+rb**2-2.d0*ra*rb*x)
f1 = dcosh(pa*ra)+dcosh(pa*rb)
f2 = 1.d0+0.5*dlam*ramrb*dexp(-pb*ramrb)
funct = (f1*f2)**2
return
end
SUBROUTINE QGAUSS(func,aa,bb,ss)
USE OMP_LIB
USE MATRIC , ONLY : XG ,WG , NG
IMPLICIT DOUBLE PRECISION(A-H,O-Z)
external func
xm = 0.5d0*(bb+aa)
xl = 0.5d0*(bb-aa)
ss = 0.d0
do j=1,ng
dx = xl*xg(j)
ss = ss + wg(j)*(func(xm+dx)+func(xm-dx))
end do
ss = xl*ss/2.0
return
END
SUBROUTINE GAULEG(x1,x2)
USE MATRIC , ONLY : XG ,WG ,NG , PI
IMPLICIT DOUBLE PRECISION(A-H,O-Z)
eps = 1.d-14
m = (ng+1)/2
xm = 0.5D0*(x1+x2)
xl = 0.5D0*(x2-x1)
do i=1,m
z = dcos(pi*(dfloat(i)-0.25d0)/(dfloat(ng)+0.5d0))
1 continue
p1 = 1.d0
p2 = 0.d0
do j=1,ng
p3 = p2
p2 = p1
p1 = ((2.d0*dfloat(j)-1.d0)*z*p2 &
- (dfloat(j)-1.d0)*p3)/dfloat(j)
end do
pp = dfloat(ng)*(z*p1-p2)/(z*z-1.d0)
z1 = z
z = z1 - p1/pp
if (dabs(z-z1).gt.eps) go to 1
xg(i) = xm - xl*z
xg(ng+1-i) = xm + xl*z
wg(i) = 2.d0*xl/((1.d0-z*z)*pp*pp)
wg(ng+1-i) = wg(i)
end do
return
end
SUBROUTINE NCDF(F,ASUM,H,KKK,NOINT)
IMPLICIT DOUBLE PRECISION (A-H,O-Z)
DIMENSION F(450)
NBLOCK = (KKK-2)/NOINT + 1
C2HO45 = 2.0D+00*H/45.0D+00
ASUM = 0.0D+00
DO J=1,NBLOCK
ISTAR = NOINT*(J-1)+5
IEND = NOINT*J + 1
IEND = MIN0(KKK,IEND)
DO I=ISTAR,IEND,4
ASUM = ASUM + C2HO45*(7.0D+00*(F(I-4)+F(I)) &
+32.0D+00*(F(I-3)+F(I-1)) + 12.0D+00*F(I-2))
END DO
IF(IEND.EQ.KKK) GO TO 4
C2HO45 = 2.0D+00*C2HO45
4 END DO
RETURN
END
Thanks everybody specially @Vladimir who has taken interest in my problem. Finally i got the right answer by removing ra and rb from the module potdata and defined function as funct(x, ra, rb) and then removing ra and rb from the loop. Because i was writing ra, rb then reading their values in the above code so loop was having flow dependence. Now i get exact result from both compiler (which is 8.7933767516) parallelly, sequentially both. Exact way is this
subroutine EFFPOT(MESH,NOINT)
USE OMP_LIB
USE MATRIC
USE POTDATA
implicit none
integer, intent(in) :: MESH, NOINT
double precision::anorm(450)
double precision, external :: funct
double precision :: asum, fac, cnorm
!$omp parallel do default(none) private(del1,ia,asum,ib,fac) &
!$omp shared(id,mesh,r,anorm,NOINT,del2,R1s)
do ia = 2,mesh
if(R1s(ia).lt.1.D-7.and.R1s(ia).ge.1.D-8)id = ia
do ib = 2,mesh
call QGAUSS(funct,-1.d0,1.d0,fac,r(ia),r(ib))
del1(ib) = r(ib)**2*fac*R1s(ib)**2
end do
CALL NCDF(del1,ASUM,r(2),mesh,NOINT)
anorm(ia) = 2.0d0*pi*asum
del2(ia) = 2.0d0*pi*asum*(r(ia)*R1s(ia))**2
end do
!$omp end parallel do
CALL NCDF(del2,ASUM,r(2),mesh,NOINT)
cnorm = 1.0/dsqrt(4.*pi*ASUM)
write(6,*)'cnorm =',cnorm
return
end
double precision function funct(x,ra,rb)
USE MATRIC , ONLY : PA , PB , DLAM
implicit none
double precision, intent(in) :: x, ra, rb
double precision :: f1, f2, ramrb
ramrb = dsqrt(ra**2+rb**2-2.d0*ra*rb*x)
f1 = dcosh(pa*ra)+dcosh(pa*rb)
f2 = 1.d0+0.5*dlam*ramrb*dexp(-pb*ramrb)
funct = (f1*f2)**2
return
end
SUBROUTINE QGAUSS(func,aa,bb,ss,ra,rb)
USE OMP_LIB
USE MATRIC , ONLY : XG ,WG , NG
IMPLICIT DOUBLE PRECISION(A-H,O-Z)
external func
xm = 0.5d0*(bb+aa)
xl = 0.5d0*(bb-aa)
ss = 0.d0
do j=1,ng
dx = xl*xg(j)
ss = ss + wg(j)*(func(xm+dx,ra,rb)+func(xm-dx,ra,rb))
end do
ss = xl*ss/2.0
return
END
Upvotes: 2
Views: 399
Reputation: 74365
The cause of your problem is that the OpenMP standard does not specify what happens if a private
list item is accessed in the region but outside of the construct. See example private.2f
(found on page 135 of the OpenMP standard supplement) for a short version of the same problem.
Specifically, the module variables ra
and rb
are declared private
in the OpenMP parallel region inside EFFPOT
and also accessed from funct
. funct
is in the dynamic scope of the parallel region but (lexically) outside of it and therefore it is not specified whether ra
and rb
referenced by funct
are the original module variables or their private copies (most compilers would go for the original variables).
You have already found one of the solutions. The other one would be to declare ra
and rb
threadprivate
since they are only used to pass data from EFFPOT
to funct
:
MODULE POTDATA
IMPLICIT NONE
INTEGER :: IA , IB , ID
DOUBLE PRECISION :: RA , RB , R1s(450)
!$OMP THREADPRIVATE(RA,RB)
END MODULE POTDATA
You should then also remove ra
and rb
from the list of the private
clause of the parallel region within EFFPOT
.
On some platforms, e.g. OS X, using threadprivate
with GCC (i.e. gfortran
) could be slower than actually passing around the two variables as arguments because of the emulated TLS.
Note that this semantic error is really hard to detect and many OpenMP tools won't actually spot it.
Upvotes: 4
Reputation: 2982
First of all, it is very difficult to say something specific without seeing the actual code. However, I do have some comments on your situation and the conclusions you are drawing.
The fact that your program runs fine both in parallel and sequential execution when compiled with "ifort" doesn't mean that your program is correct. Since compiler bugs leading to programs giving wrong answers are very rare, but on the other hand manual parallel programming is very error-prone, we should assume a problem with the way you parallelized your code. We are probably talking about a race condition.
And no, the problem you are having is not at all unusual. When you have a race condition, it happens often that the sequential execution works everywhere and your parallel execution works in some environments and fails in others. It's even common that your code gives different answers every time you call it (not only depending on the compiler, but on many other factors that can change over time).
What I suggest you should do, is to get a parallel debugger, like for example TotalView that will help you keep track of the various threads and their states. Try to find a simple test environment (as few threads as possible) that fails reliably.
Upvotes: 1